{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Loading Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "# https://archive.ics.uci.edu/ml/datasets/Sentiment+Labelled+Sentences\n",
    "# https://archive.ics.uci.edu/ml/machine-learning-databases/00331/sentiment%20labelled%20sentences.zip"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "data_dir = f'{os.getcwd()}/data'\n",
    "\n",
    "if not os.path.exists(data_dir):\n",
    "    os.mkdir(data_dir)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "# pip install requests\n",
    "import requests\n",
    "\n",
    "url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00331/sentiment%20labelled%20sentences.zip'\n",
    "\n",
    "response = requests.get(url)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "b'PK\\x03\\x04\\n\\x00\\x00\\x00\\x00\\x00'"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "response.content[:10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "import zipfile\n",
    "\n",
    "from io import BytesIO\n",
    "\n",
    "\n",
    "with zipfile.ZipFile(file=BytesIO(response.content), mode='r') as compressed_file:\n",
    "    compressed_file.extractall(data_dir)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_list = []\n",
    "\n",
    "for csv_file in ['imdb_labelled.txt', 'yelp_labelled.txt',  'amazon_cells_labelled.txt']:\n",
    "\n",
    "    csv_file_with_path = f'{data_dir}/sentiment labelled sentences/{csv_file}'\n",
    "    temp_df = pd.read_csv(\n",
    "        csv_file_with_path, \n",
    "        sep=\"\\t\", header=0, \n",
    "        names=['text', 'sentiment']\n",
    "    ) \n",
    "    df_list.append(temp_df)\n",
    "    \n",
    "df = pd.concat(df_list)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['text', 'sentiment'], dtype='object')"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(2745, 2)"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/tarek/anaconda3/envs/scikitbook/lib/python3.6/site-packages/ipykernel_launcher.py:9: UserWarning: Matplotlib is currently using module://ipykernel.pylab.backend_inline, which is a non-GUI backend, so cannot show the figure.\n",
      "  if __name__ == '__main__':\n"
     ]
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAWUAAAFUCAYAAAD8oy2zAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy8li6FKAAAalElEQVR4nO3deZhtVX3m8e9iuKDI4FZRHBAVB8SIIqISO05xjkKi7dA2qCgOPNFWo7icQWPc0Ip2RCAOKM5phzhiEOeOOGBUoiCJIIOIxmHJJMPFy+4/9qlQXO5Qp+rs+u2z9/fzPOepU+fUfXjr8tR7V6291tqpaRokSf2wRXQASdJ1LGVJ6hFLWZJ6xFKWpB6xlCWpRyxlSeoRS1mSesRSlqQesZQlqUcsZUnqEUtZknrEUpakHrGUJalHLGVJ6hFLWYOSUjohpfTrlNKPo7NIy2Epa2jeBzwqOoS0XJayBqVpmm8AJTqHtFyWsiT1iKUsST1iKUtSj1jKktQjlrIGJaX0EeBbwF1TShemlJ4VnUmaRmqaJjqDJGnCkbIk9YilLEk9YilLUo9YypLUI5ayJPWIpSxJPWIpS1KPWMqS1CNbRQeQZiHnvDWwI7ANsO16j60XfWmz6GMDXAL8GvhtXdd/XLXA0ka4o0+9l3PeArgVsCtwu/UeC6/dEkgr+M80wO9pC/rXwG8WPV/8+a+AcyxwdcVSVm/knHcA9gXuB9yd6wr31lx/tBvtauBM4HTgh5PH6XVdXxyaSoNgKStEznlL4E9oC3jhcTfm+zrH+axX1MC5dV37Q6Yls5S1KnLOtwHuz3UFfB9gu9BQq+NS4HvAF4CT6ro+MziPes5SVicmI+EHA38F/AXtVITgPOCkyeMrdV1fGRtHfWMpa2ZyzmuAhwNPAB4P3Cw2Ue9dCXyNtqA/X9f1ubFx1AeWslYk57wd8GjaIn4MsENsorl2FvD5yePrdV1fG5xHASxlTS3nvBPwONoifgRwo9hEg3QhcCLw3rquz4kOo9VjKWvJcs4PBg4FDqBfS9SGrAG+AZwAfLyu6yuC86hjlrI2Kee8PXAQ8Hxgz+A4Y3cp8AHgmLquz4oOo25YytqgnPNdgP8FHAhsHxxHN/Rl4B3AZ+q6XhcdRrNjKet6cs4PBF5GO2e8km3LWh0XAMcBx9Z1fWl0GK2cpayFsyX+Engp7QYPzZ/fAUfSTm249nmOWcojl3N+MvC3wO7RWTQTv6T9//muuq6viQ6j6VnKI5VzfgBwNI6Mh+pc4AjgA653ni+W8sjknO8A1MCTorNoVZwJvBb4pAcjzQdLeSQmGz5eBbyA9iB4jcu/Aq+u6/qfo4No0yzlgcs5b0W7xvh1eBaF4P8Bf1PX9WnRQbRhlvKA5Zz3B44C7hKdRb2yDngb8BpXavSPpTxAOec9gGNpj86UNuanwLPruv5GdBBdx1IekJxzAl5IeyFv2+A4mg8N7eaTl9d1fXl0GFnKgzG5s8f7gD8PjqL5dD5wSF3Xp0QHGTtLeQAmG0COA24anUVz7wTgJXVdXxIdZKws5Tk2WeZ2LPDU6CwalIuA59V1/dnoIGNkKc+pnPNDaacrbhccRcP1EeDQuq4vjg4yJpbynMk5bwO8CXgRnuKm7p0N7O9duFePpTxHcs57AR8E7hGdRaNyGXBgXdefjg4yBpbynMg5/3fg/bjUTTEa4PXAEZ6h0S1LeQ7knF8BvBGnKxTvU8BBdV1fFh1kqCzlHss5bw0cDxwcnUVa5Ezaeeazo4MMkaXcU5Plbp8AHhqdRdqAi4Gn1HV9cnSQodkiOoBuKOd8R+BbWMjqr52Ak3LOL48OMjSOlHsm57wf7bzdLaKzSEv0UeBgT5ybDUu5R3LOTwHeiyssNH++DvyFhxqtnNMXPZFzfjXwYSxkzacHAV/KOXv+ygo5Ug42OW7zH4BDorNIM3A68PC6rn8THWReOVIONCnkd2Ihazj2Ar4xOUpWy2ApB5kU8vHAs6OzSDN2N9pi9rCsZbCUA0wK+VjgOdFZpI7cEfhKzvnW0UHmjaUc4xjgedEhpI7tDnw153yr6CDzxFJeZTnno4BDo3NIq+QutCPmnaODzAtLeRVNdj+9LDqHtMr2AL6cc755dJB54JK4VZJzPoR2pYU0VqcCD63r+uroIH3mSHkV5JyfSLvSQhqz/WjX5GsTLOWO5ZwfBnwI/64lgKfnnJ3C2wSnLzqUc74D8D2gis4i9ci1wAHeLXvDLOWO5Jy3pZ1Du3d0FqmHLgP2q+v6x9FB+sZfqbtzHBaytDHbA5/NOXtE7Xos5Q7knJ8HPCM6h9RzuwGfyDmviQ7SJ5byjOWc7w/8n+gc0pz4b7S/VWrCUp6hya6ljwP+yy8t3cE555dEh+gLL/TNSM55S+AU4CHRWaQ5dC3wuLquT4oOEs2R8uzUWMjScm0BnOgZGZbyTEx27L00Ooc0526OO/6cvlipnPMewHeBm0RnkQbiwLquPxgdIoqlvAI55y2AbwP3jc4iDcjFwJ51XV8UHSSC0xcr80IsZGnWdgLeHR0iiiPlZco57wqcgdMWUleeXdf1e6JDrDZHyst3HBay1KWjJ4OfUbGUlyHn/GTgMdE5pIHbAThhcqPh0bCUp5Rzviluo5ZWy8OA50eHWE2W8vT+N3DL6BDSiByVc75TdIjVYilPIef8YOBZ0TmkkdkOGM0FP0t5iSaH1nvjUynGg3LOB0SHWA2W8tK9GrhzdAhpxN40Ofhr0CzlJcg57wkcFp1DGrm7AQdHh+iapbw0bwS2jg4hicNzzjeODtElS3kzcs57A/tH55AEwK2BF0WH6JKlvHmHRweQdD2H5ZxvFh2iK5byJuSc9wEeF51D0vXsSHvhfZAs5U07PDqApA06NOe8W3SILljKG5Fz3hd4bHQOSRu0BnhDdIguWMobd3h0AEmb9LSc872iQ8yapbwBOef7A4+OziFpkxLtDYsHxVLesMOjA0hakkfmnO8THWKWLOX15JwfADwyOoekJRvUumVL+YaOiA4gaSpPyjnvEh1iVizlRXLO9wUeHp1D0lTWAIdGh5gVS/n6RnWHA2lAnjs5XnfuWcoTOecdgSdH55C0LLcAnhYdYhYs5escBAz69Clp4AYxhWEpX+e50QEkrcjeQ1geZykDOecHAntG55C0Ys+JDrBSlnLrkOgAkmbiqTnnm0SHWInRl3LOeTvgCdE5JM3E9sD/iA6xEqMvZeCvaG9hLmkY5vo3X0u5XXUhaTj2yTnfKTrEco26lHPOtwEeGp1D0sw9PjrAco26lGkXm4/970AaIkt5Tj01OoCkTjww51xFh1iO0ZZyzvlWwODuWiAJgK2Ax0SHWI7RljLwiOgAkjo1l1MYYy5lD7KXhu1ROec10SGmNcpSzjknPDdZGrrtgYdEh5jWKEsZuDftUX+Shm3upjDGWsrOJ0vjYCnPCeeTpXG4bc557+gQ0xhdKU8OINovOoekVbN/dIBpjK6UaSf+5+6KrKRlm6uL+mMsZeeTpXG5V855q+gQSzXGUnY+WRqXGwF3jw6xVKMq5ZzzrsBdonNIWnVzc+++UZUysG90AEkh9okOsFRjK+V7RAeQFMKRck9ZytI47TUvF/ssZUljsC2wZ3SIpRhNKeectwF2j84hKcxczCuPppSBuwFbRoeQFGYu5pXHVMpOXUjjNqyRckrp0xt5/ZOzi9MpS1kat3vmnLeODrE504yUN3ZY9INnkGM1zMUkv6TObMMc9MBml4iklF4/ebpm0fMFdwTOn3mqbjhSlnQn4IfRITZlKev2bjf5uMWi5wAN8HPg8BlnmrnJcZ27ReeQFO7W0QE2Z7Ol3DTNMwFSSqc2TfOu7iN1Yk8gRYeQFG6X6ACbs+QdLk3TvCultCNwV+Am6733lVkHm7E9ogNI6oX5HykvSCk9A3gHcDlwxaK3Gtq55T7bOTqApF4YzkgZeCPwxKZpvtBVmA5V0QEk9ULvR8rTLInbCvhiV0E6drPoAJJ6ofcj5WlK+Ujg1SmledwF6EhZEsDNcs69vkfnNNMXLwZuBRyWUvrd4jeaptl1pqlmz1KWtGAXery/YppS/p+dpeiepSxpwTBKuWmar3cZpGOWsqQFvb7YN82BRNuklN6YUvpZSumSyWuPSCn9dXfxZsZSlrSg1xf7prlo91ba8yOeRrs2GeAM4PmzDjVLk0n97aJzSOqNXpfyNHPKfwns3jTNH1JK1wI0TfOLlNJtuok2M46SJS124+gAmzLNSHkt65V4SukWwO82/OW9YSlLWqzXN1CdppQ/BpyYUroDQEppF+AY4KNdBJshS1nSYoMp5VcC5wI/AnYCfgpcBKx/xnLf7BAdQFKv9PruI9MsiVtLu4HkxZNpi982TdNs5o9JUt/0eqQ8VbiU0o2B3WmP7rxzSu0RxU3TnDr7aDOzLjqApF4ZRimnlA6inUNeC1y56K0G6PM2a0u5v64Frgaumnzs6vnC539cnW9LPVeiA2zKNP9iHAU8oWmaU7oK0xFL+foWinA1ynBDxfhfz+u6tiSl9UxTymuBr3WUo0t9+MFfXIShZVjX9TVdf7OSlm+aUn4NcHRK6YimaX7bVaAOXA38J4FlaBFKWqq01AUUKaUH0K5Jvu3il4GmaZotO8gmSaMzTSmfDXwE+Eeuf6GPpmnOmX00SRqfaUr590Dl2mRJ6s40O/reCxzYVRBJ0nQj5X8B9qXdav2fi99rmubPZh9NksZnmlJ++sbea5rmxJklkqQRW3IpS5K6t8l1yimlA5um+cDk+cEb+7qmaU6YdTBJGqNNjpRTSic1TfOYyfOvbuTLmqZpHtpFOEkaG6cvFK6qqq2AbYFtJo+unm9Nu+FJ4/azUsozokNszDSnxP2gaZp7b+D17zVNs89sY6lrkyLssgA393zx59MszZRWqtc3vpjm7Ivd138htQcq33F2cYatqqotiSvB9QvRrfEaqz4cUrZRmy3llNL7J0/XLHq+YDfgjFmHmrWqqm5MbAkuPLcIpXjzXcrAORt53gDfpL2ham9Nfk3/Q3QOSb3R61MbN1vKTdMcAZBS+nbTNCd3H2m2Sil/rKpqLbAmOoukXpj7kTIATdOcnFK6K7AX7T36Fr/X93XKlwNVdAhJvTDfI+UFKaVXAq8FTgeuWPRWA/S9lP+ApSyp9ZvoAJsyzeqLFwH7Nk3zb12F6dDl0QEk9cYvowNsyjTrQ68EzuoqSMcsZUkLBlPKrwHenlLaJaW0xeJHV+Fm6OLoAJJ646LoAJsyzfTF+yYfn73otUQ7p9z39bcXRAeQ1Bu9HilPU8p36CxF986NDiCpN4ZRyk3TnA8wma64ZdM0vf7G1mMpS1rQ6+5a8nxwSmmnlNKHgauAsyevPT6l9LddhZuh86IDSOqFK0opl0aH2JRpLtIdD1wC3B5YO3ntW8CTZx2qA46UJUHPR8kw3Zzyw4BbN01zTUqpAWia5jcppZ27iTZTv6Id4W8bHURSqF6vvIDpRsqXADdf/EJKaVfm4F+eUkoDnB+dQ1K43vfVNKX8buATKaWHAFuklB4AnEg7rTEPnMKQ1PtSnmb64kjaXX3voL2tzgm0hfz3HeTqgqUsqfd7FqYZKT8Y+EzTNHenvQvJacC9gFt2kKsL50UHkBTu+9EBNmeaUj4WWDd5/hbaUfa1wDtnHaojjpSlcWuYg1KeZvriNk3TXJBS2gp4FLAr7dK43l/NnLCUpXE7p+9rlGG6Ur40pXRL4B7AGU3TXJ5SWkM7vzwPfkI70u/7OR2SuvGv0QGWYprpi7fTziN/iPZiH8CfMifHeZZS/kB7QL+kcZqLUp7m7IsjU0r/BKxrmmbhBqq/4PqnxvXdqcDe0SEkhZiLUp7qLOSmaf5jUSEvfP6j2cfqzDejA0gK0/uLfDBlKQ/AqdEBJIX4WSllLm52MapSLqVcAFwYnUPSqpuLqQsYWSlPOFqWxsdS7jFLWRqfuZhPBktZ0vDNxU6+BWMs5R8AV0SHkLRqvl9K+V10iKUaXSmXUv5IuwlG0jh8NjrANEZXyhNOYUjjYSnPga9FB5C0Ki4spczNfDKMt5S/Cvw+OoSkzn0uOsC0RlnKpZRrgE9F55DUubmauoCRlvLE/40OIKlTVwBfiQ4xrTGX8pdxCkMaslNKKVdFh5jWaEvZKQxp8OZu6gJGXMoTH4sOIKkTDfD56BDLMfZS/hJOYUhDdFop5VfRIZZj1KU8mcL4dHQOSTM3l1MXMPJSnnAKQxqWa4EPRIdYLku5ncKYizsSSFqSL5RSzo8OsVyjL+VSylqcwpCG5LjoACsx+lKe+GB0AEkzcR7whegQK2EpA6WULwFnROeQtGLvLKVcGx1iJSzl67wtOoCkFVkLvCc6xEpZytf5IPDr6BCSlu2TpZS5/xm2lCcme+Tn+gKBNHKD+PlNTdNEZ+iNqqp2Bi4AtonOImkqZ5RS7hEdYhYcKS8y+dXnQ9E5JE3tH6IDzIqlfENvjQ4gaSp/AN4fHWJWLOX1lFJ+DJwSnUPSkn2wlHJJdIhZsZQ37OjoAJKW5CrgjdEhZslS3rCTgTOjQ0jarGNLKT+PDjFLlvIGlFIanFuW+u4y4E3RIWbNUt64E4GfRoeQtFFvKaX8NjrErFnKGzE5AP+w6BySNui3DPTaj6W8CaWUTwFfi84h6Qb+rpRyWXSILljKm/di2jsZSOqHC4Bjo0N0xVLejFLKD2nnlyX1wxGllKujQ3TFUl6aVwGXR4eQxFkMfJBkKS9BKeWXwJHROSTxmlLKuugQXbKUl+4twKAWqUtz5jTgE9EhumYpL1Ep5UrgFdE5pJG6BjhksrFr0Czl6XwY+G50CGmE6lLK6dEhVoOH3E+pqqr9gG9G55BG5MfAfUopa6ODrAZHylMqpZwKvCM6hzQS64BnjqWQwVJerpcBP4kOIY3Am0sp34sOsZqcvlimqqr2Br4NbB2dRRqos4B7DXmjyIY4Ul6mUsr3gddF55AG6lrg4LEVMljKK3Uk8C/RIaQBelsp5VvRISI4fbFCVVXtBpwO7BAcRRqKs4F7TvYGjI4j5RUqpZwHvDA6hzQQDfCssRYyWMozUUo5Efh4dA5pAN5cSvlGdIhIlvLsPBe4KDqENMe+iEcZWMqzUkopwDNpf/2SNJ1zgKcM/QS4pbCUZ6iU8kWgjs4hzZnLgf1LKb+PDtIHlvLsvQr4WHQIaU40wIGllDOig/SFpTxjk6MFD6Ld7Sdp014/uUGxJizlDpRSrgL2B86NziL12KeAI6JD9I2bRzpUVdUewKnATtFZpJ45E7h/KeWy6CB940i5Q6WUnwBPpL1rgqTW72kv7FnIG2Apd6yU8mXg+dE5pJ5YBzy1lHJ2dJC+spRXQSnlPbhUTgJ4YSnl5OgQfWYpr55X4lI5jdthpZRjo0P0nRf6VlFVVdsCXwXuH51FWmWvL6V4/vgSWMqrrKqqmwJfAvaOziKtkreUUl4aHWJeWMoBqqraCTgF2Cc6i9Sx40spXuiegnPKAUopFwN/Dnw3OovUoeOBQ6NDzBtHyoGqqtoROBm4X3QWacaOKaW8IDrEPHKkHKiUcgnwCLzPn4blrRby8lnKwUopl9IW80nRWaQZOKqU8pLoEPPMUu6Byf3IDgA+Ep1FWqYGeE0p5eXRQeadc8o9UlXVFsDb8eKI5ssVwNNLKd6ncgYs5R6qquoI4LXROaQl+AXw+FLK96ODDIWl3FNVVT0ZeA+wXXQWaSO+CxxQSvlldJAhcU65p0op/0i7Hfun0VmkDfgo8CALefYcKffcZC3z+4HHR2eRaC/ova6U8oboIEPlSLnnJmuZDwBeDVwbHEfjdgXwJAu5W46U50hVVY8EPgxU0Vk0OhfS3i3EC3odc6Q8RyaHg98H+EF0Fo3K14F9LeTVYSnPmVLKecB+wPtik2gErgReDDzEC3qrx+mLOVZV1fOAo4EbRWfR4HyHdkPIv0cHGRtHynOslHI88CfAl6OzaDDWAq8C/tRCjuFIeSCqqnoG8Ba8CKjl+zfgoFLK6dFBxsyR8kCUUt4H7EG7qF+axjrg74D7WsjxHCkPUFVVjwWOA24XnUW9dxbt3LF3wekJR8oDVEr5PHB34O9xw4k27BrgzcC9LeR+caQ8cFVV3Q94N3CP6CzqhQb4GPDKUso50WF0Q5byCFRVtTXwN8DLgZ2C4yjO14HDHBn3m6U8IlVV7QS8BHgRsH1wHK2eM4GXl1I+Fx1Em2cpj1BVVTcDXgq8AM9rHrKLgNcB7y2lrIsOo6WxlEesqqpb0E5pHIq7AofkMuAo4OhSyhXRYTQdS1lUVbUL8ArgOcA2wXG0fFfQ3q3mDaWU30SH0fJYyvovVVXdlnaL7cHAmuA4WrqfAe8ATiilXBwdRitjKesGqqq6PfBc4JnArYLjaONOob37+edLKa5HHwhLWRtVVdVWwGOBQ4BHAVvGJhJwOXAicEwp5azoMJo9S1lLMpnaeCbwLOD2wXHG6GzgGNqVFJdGh1F3LGVNpaqqLYCHA88G9ge2jk00aFcCJwPvAr5QSvGHdQQsZS1bVVU7A0+nHUHvERxnKC4GPgf8E/DPLmkbH0tZM1FV1Z2AR08eD8F1z9P4JfAp2iL+WinlmuA8CmQpa+aqqtoWeBDwGNqSvnNsol46m7aEPwl8x6kJLbCU1TlH0UC75fk7wLeBk0opPw7Oo56ylLWqJqPoPwP2AfaaPO7MsM72vhz4Hm0Jf5d2JPyL2EiaF5aywlVVdWPa8573WvS4J7BDZK4lWgf8iEn5Tj6e6WYOLZelrF6qqioBu3H90fTOwC0WfVyN5XiXAT/fyONC4PxSypWrkEMjYSlrbk3Oh15c0ut/3HbypWnRH0u0t8i6GrhqvceVwK9YVLyllEs6/0akRSxlSeqRIV1ckaS5ZylLUo9YypLUI5ayJPWIpSxJPWIpS1KPWMqS1COWskYrpfSolNK/p5TOTinl6DwSuHlEI5VS2hL4D9q7qFwInAY8tWmaM0ODafQcKWus9gXObprmZ03TrAU+Snt7KymUpayxug3t+RYLLpy8JoWylCWpRyxljdUvgNst+vy2k9ekUJayxuo04M4ppTuklNYATwE+E5xJYqvoAFKEpmn+mFL6a+BkYEvghKZpzgiOJbkkTpL6xOkLSeoRS1mSesRSlqQesZQlqUcsZUnqEUtZknrEUpakHrGUJalHLGVJ6hFLWZJ6xFKWpB6xlCWpRyxlSeoRS1mSesRSlqQesZQlqUf+P6iOlhZ23TnBAAAAAElFTkSuQmCC\n",
      "text/plain": [
       "<Figure size 576x432 with 1 Axes>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "fig, ax = plt.subplots(1, 1, figsize=(8, 6))\n",
    "\n",
    "explode = [0.05, 0.05]\n",
    "colors = ['#777777', '#111111']\n",
    "df['sentiment'].value_counts().plot(\n",
    "    kind='pie', colors=colors, explode=explode, ax=ax\n",
    ")\n",
    "\n",
    "fig.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1    0.504554\n",
       "0    0.495446\n",
       "Name: sentiment, dtype: float64"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df['sentiment'].value_counts() / df['sentiment'].shape[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "pd.options.display.max_colwidth = 90"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>text</th>\n",
       "      <th>sentiment</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>471</th>\n",
       "      <td>This is a stunning movie.</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>278</th>\n",
       "      <td>I had the mac salad and it was pretty bland so I will not be getting that again.</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20</th>\n",
       "      <td>The food, amazing.</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>150</th>\n",
       "      <td>Audio Quality is poor, very poor.</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>430</th>\n",
       "      <td>His acting alongside Olivia De Havilland was brilliant and the ending was fantastic!</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                                                       text  \\\n",
       "471                                                             This is a stunning movie.     \n",
       "278        I had the mac salad and it was pretty bland so I will not be getting that again.   \n",
       "20                                                                       The food, amazing.   \n",
       "150                                                       Audio Quality is poor, very poor.   \n",
       "430  His acting alongside Olivia De Havilland was brilliant and the ending was fantastic!     \n",
       "\n",
       "     sentiment  \n",
       "471          1  \n",
       "278          0  \n",
       "20           1  \n",
       "150          0  \n",
       "430          1  "
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df[['text', 'sentiment']].sample(5, random_state=42)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 207,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "df_train, df_test = train_test_split(df, test_size=0.4, random_state=42)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 208,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(1647, 1098)"
      ]
     },
     "execution_count": 208,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_train.shape[0], df_test.shape[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 209,
   "metadata": {},
   "outputs": [],
   "source": [
    "y_train = df_train['sentiment']\n",
    "y_test = df_test['sentiment']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 210,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "\n",
    "vec = CountVectorizer(ngram_range=(1,3), min_df=3, strip_accents='ascii')\n",
    "\n",
    "x_train = vec.fit_transform(df_train['text'])\n",
    "x_test = vec.transform(df_test['text'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 211,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>token</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>id</th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>12</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>20</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>20 minutes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>30</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1834</th>\n",
       "      <td>your money</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1835</th>\n",
       "      <td>your time</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1836</th>\n",
       "      <td>yourself</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1837</th>\n",
       "      <td>zero</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1838</th>\n",
       "      <td>zero stars</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>1839 rows × 1 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "           token\n",
       "id              \n",
       "0             10\n",
       "1             12\n",
       "2             20\n",
       "3     20 minutes\n",
       "4             30\n",
       "...          ...\n",
       "1834  your money\n",
       "1835   your time\n",
       "1836    yourself\n",
       "1837        zero\n",
       "1838  zero stars\n",
       "\n",
       "[1839 rows x 1 columns]"
      ]
     },
     "execution_count": 211,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.DataFrame(vec.vocabulary_.items(), columns=['token', 'id']).sort_values('id').set_index('id')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 212,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<1647x1839 sparse matrix of type '<class 'numpy.int64'>'\n",
       "\twith 19648 stored elements in Compressed Sparse Row format>"
      ]
     },
     "execution_count": 212,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "x_train"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 213,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<1098x1839 sparse matrix of type '<class 'numpy.int64'>'\n",
       "\twith 12322 stored elements in Compressed Sparse Row format>"
      ]
     },
     "execution_count": 213,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "x_test"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 214,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Precision</th>\n",
       "      <th>Recall</th>\n",
       "      <th>F</th>\n",
       "      <th>Support</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.51</td>\n",
       "      <td>0.47</td>\n",
       "      <td>0.49</td>\n",
       "      <td>565</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0.48</td>\n",
       "      <td>0.51</td>\n",
       "      <td>0.50</td>\n",
       "      <td>533</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Precision  Recall     F  Support\n",
       "0       0.51    0.47  0.49      565\n",
       "1       0.48    0.51  0.50      533"
      ]
     },
     "execution_count": 214,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.dummy import DummyClassifier\n",
    "\n",
    "dummy_clf = DummyClassifier(strategy=\"stratified\")\n",
    "\n",
    "dummy_clf.fit(x_train, y_train)\n",
    "\n",
    "y_dummy_test_pred = dummy_clf.predict(x_test)\n",
    "\n",
    "p, r, f, s = precision_recall_fscore_support(y_test, y_dummy_test_pred)\n",
    "\n",
    "pd.DataFrame(\n",
    "    {\n",
    "        'Precision': p,\n",
    "        'Recall': r,\n",
    "        'F': f,\n",
    "        'Support': s,\n",
    "    },\n",
    "    index=[0,1] \n",
    ").round(2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 222,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Precision</th>\n",
       "      <th>Recall</th>\n",
       "      <th>F</th>\n",
       "      <th>Support</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.81</td>\n",
       "      <td>0.78</td>\n",
       "      <td>0.79</td>\n",
       "      <td>565</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0.77</td>\n",
       "      <td>0.80</td>\n",
       "      <td>0.79</td>\n",
       "      <td>533</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Precision  Recall     F  Support\n",
       "0       0.81    0.78  0.79      565\n",
       "1       0.77    0.80  0.79      533"
      ]
     },
     "execution_count": 222,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.metrics import precision_recall_fscore_support \n",
    "from sklearn.naive_bayes import MultinomialNB\n",
    "\n",
    "clf = MultinomialNB(fit_prior=True)\n",
    "clf.fit(x_train, y_train)\n",
    "y_test_pred = clf.predict(x_test)\n",
    "\n",
    "p, r, f, s = precision_recall_fscore_support(y_test, y_test_pred)\n",
    "\n",
    "pd.DataFrame(\n",
    "    {\n",
    "        'Precision': p,\n",
    "        'Recall': r,\n",
    "        'F': f,\n",
    "        'Support': s,\n",
    "    },\n",
    "    index=[0,1] \n",
    ").round(2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 223,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    0.81\n",
       "1    0.77\n",
       "dtype: float64"
      ]
     },
     "execution_count": 223,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.Series(p).round(2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 229,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.7897"
      ]
     },
     "execution_count": 229,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.Series(p).round(4).mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 230,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Macro Precision = 78.97% & Recall=78.99%\n"
     ]
    }
   ],
   "source": [
    "y_test_pred = clf.predict(x_test)\n",
    "\n",
    "p, r, f, s = precision_recall_fscore_support(y_test, y_test_pred, average='macro')\n",
    "\n",
    "print(f'Macro Precision = {p:.2%} & Recall={r:.2%}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 231,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.7896174863387978"
      ]
     },
     "execution_count": 231,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "precision_recall_fscore_support(y_test, y_test_pred, average='micro')[2]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 232,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Precision</th>\n",
       "      <th>Recall</th>\n",
       "      <th>F</th>\n",
       "      <th>Support</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.798214</td>\n",
       "      <td>0.791150</td>\n",
       "      <td>0.794667</td>\n",
       "      <td>565</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0.780669</td>\n",
       "      <td>0.787992</td>\n",
       "      <td>0.784314</td>\n",
       "      <td>533</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Precision    Recall         F  Support\n",
       "0   0.798214  0.791150  0.794667      565\n",
       "1   0.780669  0.787992  0.784314      533"
      ]
     },
     "execution_count": 232,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.metrics import precision_recall_fscore_support \n",
    "from sklearn.naive_bayes import MultinomialNB\n",
    "\n",
    "clf = MultinomialNB(fit_prior=False)\n",
    "clf.fit(x_train, y_train)\n",
    "y_test_pred = clf.predict(x_test)\n",
    "\n",
    "p, r, f, s = precision_recall_fscore_support(y_test, y_test_pred)\n",
    "\n",
    "pd.DataFrame(\n",
    "    {\n",
    "        'Precision': p,\n",
    "        'Recall': r,\n",
    "        'F': f,\n",
    "        'Support': s,\n",
    "    },\n",
    "    index=[0,1] \n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 154,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.7896174863387978"
      ]
     },
     "execution_count": 154,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "precision_recall_fscore_support(y_test, y_test_pred, average='micro')[2]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 155,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Micro FScore = 0.7978142076502732 @ ngram_range = (1, 1)\n",
      "Micro FScore = 0.6520947176684881 @ ngram_range = (2, 2)\n",
      "Micro FScore = 0.5373406193078324 @ ngram_range = (3, 3)\n",
      "Micro FScore = 0.5136612021857924 @ ngram_range = (4, 4)\n",
      "Micro FScore = 0.5127504553734062 @ ngram_range = (5, 5)\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/tarek/anaconda3/envs/scikitbook/lib/python3.6/site-packages/ipykernel_launcher.py:37: UserWarning: Matplotlib is currently using module://ipykernel.pylab.backend_inline, which is a non-GUI backend, so cannot show the figure.\n"
     ]
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAApQAAAJZCAYAAAAXuaVWAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy8li6FKAAAgAElEQVR4nO3de5hkd13n8c+XjAEFTVhRZJMIUZrVEV0QSLwtFwFJgE1AEBPXS1jxgV2jKF42rhrZqKu4j7LqBi+LXEQhRtbLoONGEbyAgAMSwSQbe4hIJnITmCCXJA589486g5Wmp7uSX890T+r1ep560uecX1X9qk5m8s45dbqquwMAALfXnbZ7AgAAHN8EJQAAQwQlAABDBCUAAEMEJQAAQwQlAABDBCVwu1TVw6uqq+rU7Z4LANtLUAJJkqp60RSIv7XOtnOnbYfmVv9Fknsl+YdjNslbz6nXud00t/3eVfXiqrq+qm6uqndV1Sur6tHbMV+AO7Jd2z0BYEd5R5LHV9U9u/vdc+ufnuTvk3ziaGR335LkXbf3iaqqkuzq7n++vY+R5MIk/2duuafH/pQkr0xyfZJvyOx13TPJw5N85sDzbaqqPmXwNR0XqurE6d8BAEcogVtZTfL6JBccXlFVn5vk0UleOD9wvVPeVfX5VfXyqnp/VX2kqt5SVY+ftl1QVYeq6hFV9eYkNyd51LTtW6rq6qq6paoOVNWPVdUi/8N7Y3e/a+52OIK/KMl9k3xnd/95d/99d/9ld/9Ud182N99dVfUjVfW26SjmDVX183Pb71VVl1XVwar6aFX9SVU9eJ334HFV9ZrpCOnTpm0Pqqo/rKoPVdV7q+q3qureR3ohVfXjVXXtOut/oapeM/38GVX1wulo683T0def2eAx7zPN7ylV9XvTPrmuqi5YM+70aa43TY/57dNrff7cmLdP++V5VfW+JH8+rX9mVV05vc53Te/XvdZ5jx5bVa+b3sc3VdUXTbfXTPP6y6rafaTXAuxsghJY65eTPG06gpjMAumPMztCeURV9TmZnQY/Ock5Sb44yQ8n+fjcsDsleU6SZyX5giRvrKrHJXlBkpckuX+S70ny7Ul+ZOA1vGd63idX1YkbjPuV6bmenWR3kicluW56PZXkd6Z5Pj7JGUneneSPquoeax7np6fX9YVJXjGF0Z8meV2SByf56iQfm+57lyPM5cVJ7ldVZx5eUVV3TvL1SX51WvVjSb40yblJVqZt12zw+g77yekxviTJZUmeX1X3m3udv53kpCQPTfLvkzwuyQPXeZzvzOy9/fIkT51b/72Z7e8nJvnc6TnW+vEkP5jkQUluSfKyJL+Q2X4+vO6F69wPOB50t5ubm1uSvCiz08R3SfK+JI9IckKSA0m+NrOjlofmxj88s1PMp07LP5rZKfC7HuHxL5jG/7s16/88yeVr1j0zyUeTnLjBfDvJTUk+NHf74bntz5jWfTTJazMLvofMbb/v9BhPPsLjP3Lavntu3Z2TvDPJxWveg29a5728bM26Oyf5SJInbPCaXp/k0rnlJ0/zP3la/t0kL7oN+/Q+0/yeNbfuhCT/lOTp0/KjpzH3nRvzr6a5Pn9u3duT/PECz/nA6fFOWfMePWFuzNdN6540t+6J07q7bfefBTc3t9t+c4QSuJXuvimzo4XfltmRql1JXrHAXR+U5C+6+8ObjNu3ZvmLkvzZmnV/mlnYfv4mj/WDSR4wd7v08Ibu/sUkn5PZUcc/SvKwJG+oqv8yDfnS6Z9/eITH/qIk7+vuq+ce8+Ykb5i2zfvLNcsPSfLE6TTwh6rqQ5lF+l0yO7J4JC9O8vXTZ0CT5JuT7Onug9Py8zI76vo3VfWzVXV2VS3y9/iVc6/hY5kdZbzntGp3kn/s7v1zY96f5JNOv6/zOg+f0r5iOlX+T0leM21ae3r/r+d+PvzZ27ess+6zN3ktwA7kohxgPb+c5K+SnJbkhd39z/9yBnzIx6Zg3Srvng+htbr7Q0n2TrdnT58JvKSqnruFc0iStRF9p8yi/CfXGfu+DR7nsiT/M8njquq1Sc5K8oTDG7v7iukzrY/J7MjfryV5a1U9cgrFI1l78Uzn1h956g3uO+9Wr3Oay97MXuslSf4xswu3Xplk7UcN5i9U6g3WOdABxyF/cIFPMh2V25fkK5M8f5Phh70pyVdU1V1v49Ndldln9+Y9LLNTvW+7jY+1mWsyC52TMgvmJPmaDeb1mfMXikyfaTwzyd9s8jxvzOzzim/r7v1rbh840p2mba9I8k1Jzk/y/iRXrBnz/u5+WXc/PbMjyA/L7Cjj7XV1ks+qqk8cDa6quye53wL3fUiST03yXd392u6+Nv9y5BNYIoISOJLHJLlHdy8adc/L7O+U362qr5yuHH58VZ29yf1+IsmTquqiqrpfVT0ls4tkfrpv56+lqaoHVtUrpqub719Vn1dVX5/k+5O8trvfOx3Z/PUkz6uqb6zZFeoPqapnTg/zqsxO8b50ej33z+zClrtkdjHJRv57Zhfo/FpVnTG9F4+YTlN/3ib3/dXMLgJ6RpJfnz/yOF0J/rVV9W+qaiXJf8jsc6LvuC3vzxqvzOx09Eum1/9vMzvieCibH7lcncZ8z/Qan5Dk4oG5AMcpQQmsq7s/Mn2WbtHx70zyVZld8LE3syN8P55kw3Pl3b03yX9M8i2ZHfl7bmZx+t9u38yTzH7/5P4k/zWzC3LemtlFQy/O7Ar0w56a5Jcyu3r6msyudj59mldndrr5/yX5/cyO2H5Okkd39z9u8pquSfIVSe6W2RHGq5P878yO5h3c4K5J8gdJbswsSH91zbabMju1/Kb8y1HQs7v7xk0ec6O5dmYXxHw4swukfm+aw7XT821037ck+Y7Mfk/p1Zld7f1dt3cuwPGrZn+XAMBMVX16Zlf3/1B3//xm4wFclAOw5KrqnMxOcV+T2VXWP5LZqezLt3NewPFDUALwaZl99vE+mZ36flOSr+pbf/0mwBE55Q0AwJBjdoTyxhtvVK4AAMe5k0466ZMutnSVNwAAQwQlAABDBOUxsLq6ut1T4Bizz5eT/b587PPlY5+vT1ACADBEUAIAMERQAgAwRFACADBEUAIAMERQAgAwRFACADBEUAIAMERQAgAwRFACADBEUAIAMERQAgAwRFACADBEUAIAMERQAgAwRFACADBEUAIAMERQAgAwRFACADBkoaCsqrOq6tqq2l9VF62z/XOr6tVV9eaqektVPXbrpwoAwE60aVBW1QlJLk1ydpLdSc6vqt1rhv1Qksu7+4FJzkvyvK2eKAAAO9MiRyjPSLK/u6/r7luSXJbk3DVjOslnTD+flOQftm6KAADsZNXdGw+oenKSs7r7adPyNyU5s7svnBtzryR/mOTuSe6a5FHd/ab5x7nxxhs/8USrq6tb9gIAADi6VlZWPvHzSSedVGu379qi5zk/yYu6+6er6suTvKSq7t/dH99sUsfKySeffMyfc6c4ePDgdk9h6ayurm7Lv+dsL/t9+djny8c+X98ip7xvSHLa3PKp07p535rk8iTp7tcluUuSe2zFBAEA2NkWCcp9SVaq6vSqOjGzi272rBnzjiSPTJKq+sLMgvK9WzlRAAB2pk2DsrsPJbkwyRVJrsnsau6rquqSqjpnGvY9Sb6tqv46ycuSXNCbfTgTAIA7hIU+Q9nde5PsXbPu4rmfr07ylVs7NQAAjge+KQcAgCGCEgCAIYISAIAhghIAgCGCEgCAIYISAIAhghIAgCGCEgCAIYISAIAhghIAgCGCEgCAIYISAIAhghIAgCGCEgCAIYISAIAhghIAgCGCEgCAIYISAIAhghIAgCGCEgCAIYISAIAhghIAgCGCEgCAIYISAIAhghIAgCGCEgCAIYISAIAhghIAgCGCEgCAIYISAIAhghIAgCGCEgCAIYISAIAhghIAgCGCEgCAIYISAIAhghIAgCGCEgCAIYISAIAhghIAgCGCEgCAIYISAIAhghIAgCGCEgCAIYISAIAhghIAgCGCEgCAIYISAIAhghIAgCGCEgCAIYISAIAhghIAgCGCEgCAIYISAIAhghIAgCGCEgCAIQsFZVWdVVXXVtX+qrpone3Praorp9vfVtXBrZ8qAAA70a7NBlTVCUkuTfLoJAeS7KuqPd199eEx3f3dc+O/I8kDj8JcAQDYgRY5QnlGkv3dfV1335LksiTnbjD+/CQv24rJAQCw8216hDLJKUmun1s+kOTM9QZW1b2TnJ7kVRs94Orq6qLzYwt4v7eH93052e/Lxz5fPsu4z1dWVjbcvkhQ3hbnJXl5d39so0GbTYqt5f0+9lZXV73vS8h+Xz72+fKxz9e3yCnvG5KcNrd86rRuPefF6W4AgKWySFDuS7JSVadX1YmZReOetYOq6guS3D3J67Z2igAA7GSbBmV3H0pyYZIrklyT5PLuvqqqLqmqc+aGnpfksu7uozNVAAB2ooU+Q9nde5PsXbPu4jXLz966aQEAcLzwTTkAAAwRlAAADBGUAAAMEZQAAAwRlAAADBGUAAAMEZQAAAwRlAAADBGUAAAMEZQAAAwRlAAADBGUAAAMEZQAAAwRlAAADBGUAAAMEZQAAAwRlAAADBGUAAAMEZQAAAwRlAAADBGUAAAMEZQAAAwRlAAADBGUAAAMEZQAAAzZtd0TgKPl5JNP3u4pbJuDBw9u9xQAWCKOUAIAMERQAgAwRFACADBEUAIAMERQAgAwRFACADBEUAIAMERQAgAwRFACADBEUAIAMERQAgAwRFACADBEUAIAMERQAgAwRFACADBEUAIAMERQAgAwRFACADBEUAIAMERQAgAwRFACADBEUAIAMERQAgAwRFACADBEUAIAMERQAgAwRFACADBEUAIAMERQAgAwZKGgrKqzquraqtpfVRcdYcxTqurqqrqqql66tdMEAGCn2rXZgKo6IcmlSR6d5ECSfVW1p7uvnhuzkuQHknxld3+gqj77aE0YAICdZZEjlGck2d/d13X3LUkuS3LumjHfluTS7v5AknT3e7Z2mgAA7FSLBOUpSa6fWz4wrZt3vyT3q6rXVtXrq+qsrZogAAA726anvG/D46wkeXiSU5P8WVV9cXcfXG/w6urqFj0ti/B+Lx/7fPt475ePfb58lnGfr6ysbLh9kaC8Iclpc8unTuvmHUjyhu7+5yR/V1V/m1lg7rs9k2Jreb+Xj32+PVZXV733S8Y+Xz72+foWOeW9L8lKVZ1eVScmOS/JnjVjfiezo5Opqntkdgr8ui2cJwAAO9SmQdndh5JcmOSKJNckuby7r6qqS6rqnGnYFUneV1VXJ3l1ku/r7vcdrUkDALBzLPQZyu7em2TvmnUXz/3cSZ413QAAWCK+KQcAgCGCEgCAIYISAIAhghIAgCGCEgCAIYISAIAhghIAgCGCEgCAIYISAIAhghIAgCGCEgCAIYISAIAhghIAgCGCEgCAIYISAIAhghIAgCGCEgCAIYISAIAhghIAgCGCEgCAIYISAIAhghIAgCGCEgCAIYISAIAhghIAgCGCEgCAIYISAIAhghIAgCGCEgCAIYISAIAhghIAgCGCEgCAIYISAIAhghIAgCGCEgCAIbu2ewIAW+Xkk0/e7ilsm4MHD273FIAl5gglAABDBCUAAEMEJQAAQwQlAABDBCUAAEMEJQAAQwQlAABDBCUAAEMEJQAAQwQlAABDBCUAAEMEJQAAQwQlAABDBCUAAEMEJQAAQ3Zt9wQAYMTJJ5+83VPYFgcPHtzuKWybZd3nyc7d745QAgAwRFACADBEUAIAMERQAgAwZKGgrKqzquraqtpfVRets/2CqnpvVV053Z629VMFAGAn2vQq76o6IcmlSR6d5ECSfVW1p7uvXjP0N7r7wqMwRwAAdrBFjlCekWR/d1/X3bckuSzJuUd3WgAAHC8W+T2UpyS5fm75QJIz1xn3pKp6aJK/TfLd3X39OmOSJKurq7dpkozxfi8f+3z52OfLxz5fTtu131dWVjbcvlW/2PwVSV7W3TdX1dOTvDjJV9/eSbG1vN/Lxz5fPvb58rHPl9NO3e+LnPK+Iclpc8unTus+obvf1903T4vPT/KgrZkeAAA73SJBuS/JSlWdXlUnJjkvyZ75AVV1r7nFc5Jcs3VTBABgJ9v0lHd3H6qqC5NckeSEJC/o7quq6pIkb+zuPUm+s6rOSXIoyfuTXHAU5wwAwA6y0Gcou3tvkr1r1l089/MPJPmBrZ0aAADHA9+UAwDAEEEJAMAQQQkAwBBBCQDAEEEJAMAQQQkAwBBBCQDAEEEJAMAQQQkAwBBBCQDAEEEJAMAQQQkAwBBBCQDAEEEJAMAQQQkAwBBBCQDAEEEJAMAQQQkAwBBBCQDAEEEJAMAQQQkAwBBBCQDAEEEJAMAQQQkAwBBBCQDAEEEJAMAQQQkAwBBBCQDAEEEJAMAQQQkAwBBBCQDAEEEJAMAQQQkAwBBBCQDAEEEJAMAQQQkAwBBBCQDAEEEJAMAQQQkAwBBBCQDAEEEJAMAQQQkAwBBBCQDAEEEJAMAQQQkAwBBBCQDAEEEJAMAQQQkAwBBBCQDAEEEJAMAQQQkAwBBBCQDAEEEJAMAQQQkAwBBBCQDAEEEJAMCQhYKyqs6qqmuran9VXbTBuCdVVVfVg7duigAA7GSbBmVVnZDk0iRnJ9md5Pyq2r3OuE9P8swkb9jqSQIAsHMtcoTyjCT7u/u67r4lyWVJzl1n3I8meU6Sm7ZwfgAA7HC7FhhzSpLr55YPJDlzfkBVfWmS07r796vq+zZ7wNXV1ds0ScZ4v5ePfb587PPlY58vp+3a7ysrKxtuXyQoN1RVd0ryM0kuWPQ+m02KreX9Xj72+fKxz5ePfb6cdup+X+SU9w1JTptbPnVad9inJ7l/kj+pqrcn+bIke1yYAwCwHBYJyn1JVqrq9Ko6Mcl5SfYc3tjdN3b3Pbr7Pt19nySvT3JOd7/xqMwYAIAdZdOg7O5DSS5MckWSa5Jc3t1XVdUlVXXO0Z4gAAA720KfoezuvUn2rll38RHGPnx8WgAAHC98Uw4AAEMEJQAAQwQlAABDBCUAAEMEJQAAQwQlAABDBCUAAEMEJQAAQwQlAABDBCUAAEMEJQAAQwQlAABDBCUAAEMEJQAAQwQlAABDBCUAAEMEJQAAQwQlAABDBCUAAEMEJQAAQwQlAABDBCUAAEMEJQAAQwQlAABDBCUAAEMEJQAAQwQlAABDBCUAAEMEJQAAQwQlAABDBCUAAEMEJQAAQwQlAABDBCUAAEMEJQAAQwQlAABDBCUAAEMEJQAAQwQlAABDBCUAAEMEJQAAQwQlAABDBCUAAEMEJQAAQwQlAABDBCUAAEMEJQAAQwQlAABDBCUAAEMEJQAAQwQlAABDBCUAAEMEJQAAQwQlAABDBCUAAEMEJQAAQxYKyqo6q6qurar9VXXROtufUVVvraorq+o1VbV766cKAMBOtGlQVtUJSS5NcnaS3UnOXycYX9rdX9zdD0jyU0l+ZstnCgDAjrTIEcozkuzv7uu6+5YklyU5d35Ad39wbvGuSXrrpggAwE62a4ExpyS5fm75QJIz1w6qqm9P8qwkJyb56o0ecHV19TZMkVHe7+Vjny8f+3z52OfLabv2+8rKyobbFwnKhXT3pUkurapvSPJDSb7l9k6KreX9Xj72+fKxz5ePfb6cdup+X+SU9w1JTptbPnVadySXJXnCyKQAADh+LBKU+5KsVNXpVXVikvOS7JkfUFXzufy4JI7DAwAsiU1PeXf3oaq6MMkVSU5I8oLuvqqqLknyxu7ek+TCqnpUkn9O8oFscLobAIA7loU+Q9nde5PsXbPu4rmfn7nF8wIA4Djhm3IAABgiKAEAGCIoAQAYIigBABgiKAEAGCIoAQAYIigBABgiKAEAGCIoAQAYIigBABgiKAEAGCIoAQAYIigBABgiKAEAGCIoAQAYIigBABgiKAEAGCIoAQAYIigBABgiKAEAGCIoAQAYIigBABgiKAEAGCIoAQAYIigBABgiKAEAGCIoAQAYIigBABgiKAEAGCIoAQAYIigBABgiKAEAGCIoAQAYIigBABgiKAEAGCIoAQAYIigBABgiKAEAGCIoAQAYIigBABgiKAEAGCIoAQAYIigBABgiKAEAGCIoAQAYIigBABgiKAEAGCIoAQAYIigBABgiKAEAGCIoAQAYIigBABgiKAEAGCIoAQAYIigBABgiKAEAGLJQUFbVWVV1bVXtr6qL1tn+rKq6uqreUlV/XFX33vqpAgCwE20alFV1QpJLk5ydZHeS86tq95phb07y4O7+kiQvT/JTWz1RAAB2pkWOUJ6RZH93X9fdtyS5LMm58wO6+9Xd/ZFp8fVJTt3aaQIAsFPtWmDMKUmun1s+kOTMDcZ/a5I/2OgBV1dXF3hator3e/nY58vHPl8+9vly2q79vrKysuH2RYJyYVX1jUkenORhG43bbFJsLe/38rHPl499vnzs8+W0U/f7IkF5Q5LT5pZPndbdSlU9KskPJnlYd9+8NdMDAGCnW+QzlPuSrFTV6VV1YpLzkuyZH1BVD0zyS0nO6e73bP00AQDYqTYNyu4+lOTCJFckuSbJ5d19VVVdUlXnTMP+R5K7JfnNqrqyqvYc4eEAALiDWegzlN29N8neNesunvv5UVs8LwAAjhO+KQcAgCGCEgCAIYISAIAhghIAgCGCEgCAIYISAIAhghIAgCGCEgCAIYISAIAhghIAgCGCEgCAIYISAIAhghIAgCGCEgCAIYISAIAhghIAgCGCEgCAIYISAIAhghIAgCGCEgCAIYISAIAhghIAgCGCEgCAIYISAIAhghIAgCGCEgCAIYISAIAhghIAgCGCEgCAIYISAIAhghIAgCGCEgCAIYISAIAhghIAgCGCEgCAIYISAIAhghIAgCGCEgCAIYISAIAhghIAgCGCEgCAIYISAIAhghIAgCGCEgCAIYISAIAhghIAgCGCEgCAIYISAIAhghIAgCGCEgCAIYISAIAhghIAgCGCEgCAIYISAIAhghIAgCGCEgCAIQsFZVWdVVXXVtX+qrpone0Praq/qqpDVfXkrZ8mAAA71aZBWVUnJLk0ydlJdic5v6p2rxn2jiQXJHnpVk8QAICdbdcCY85Isr+7r0uSqrosyblJrj48oLvfPm37+FGYIwAAO9giQXlKkuvnlg8kOXPkSVdXV0fuzm3k/V4+9vnysc+Xj32+nLZrv6+srGy4fZGg3HKbTYqt5f1ePvb58rHPl499vpx26n5f5KKcG5KcNrd86rQOAAAWCsp9SVaq6vSqOjHJeUn2HN1pAQBwvNg0KLv7UJILk1yR5Jokl3f3VVV1SVWdkyRV9ZCqOpDk65L8UlVddTQnDQDAzrHQZyi7e2+SvWvWXTz3877MToUDALBkfFMOAABDBCUAAEMEJQAAQwQlAABDBCUAAEMEJQAAQwQlAABDBCUAAEMEJQAAQwQlAABDBCUAAEMEJQAAQwQlAABDBCUAAEMEJQAAQwQlAABDBCUAAEMEJQAAQwQlAABDBCUAAEMEJQAAQwQlAABDBCUAAEMEJQAAQwQlAABDBCUAAEMEJQAAQwQlAABDBCUAAEMEJQAAQwQlAABDBCUAAEMEJQAAQwQlAABDBCUAAEMEJQAAQwQlAABDBCUAAEMEJQAAQwQlAABDBCUAAEMEJQAAQwQlAABDBCUAAEMEJQAAQwQlAABDBCUAAEMEJQAAQwQlAABDBCUAAEMEJQAAQwQlAABDBCUAAEMEJQAAQwQlAABDBCUAAEMWCsqqOquqrq2q/VV10Trb71xVvzFtf0NV3WerJwoAwM60aVBW1QlJLk1ydpLdSc6vqt1rhn1rkg90932TPDfJc7Z6ogAA7EzV3RsPqPryJM/u7sdMyz+QJN39E3NjrpjGvK6qdiV5V5LP6rkHv/HGGzd+IgAAdryTTjqp1q5b5JT3KUmun1s+MK1bd0x3H0pyY5LPvH3TBADgeOKiHAAAhuxaYMwNSU6bWz51WrfemAPTKe+TkrxvfsB6h0cBADj+LXKEcl+Slao6vapOTHJekj1rxuxJ8i3Tz09O8qre7MOZAADcIWx6hLK7D1XVhUmuSHJCkhd091VVdUmSN3b3niS/kuQlVbU/yfszi04AAJbApld5AwDARlyUA1ugqr6gqh5ZVXdbs/6s7ZoTR19VnVFVD5l+3l1Vz6qqx273vDg2qupXt3sOHFtV9VXTn/Ov2e657DSOUB5DVfXU7n7hds+DrVVV35nk25Nck+QBSZ7Z3b87bfur7v7S7ZwfR0dV/UhmX/iwK8kfJTkzyauTPDrJFd3949s4PbZYVa29dqCSPCLJq5Kku8855pPiqKuqv+zuM6afvy2zv+t/O8nXJHlFd//kds5vJxGUx1BVvaO7P3e758HWqqq3Jvny7v7Q9LWjL0/yku7+2ap6c3c/cFsnyFEx7fcHJLlzZl/mcGp3f7CqPjXJG7r7S7Z1gmypqvqrJFcneX6SziwoX5bpmoHu/tPtmx1Hy/zf4VW1L8lju/u9VXXXJK/v7i/e3hnuHIv82iBug6p6y5E2JbnnsZwLx8yduvtDSdLdb6+qhyd5eVXdO7P9zh3Toe7+WJKPVNXbuvuDSdLdH62qj2/z3Nh6D07yzCQ/mOT7uvvKqvqokLzDu1NV3T2zjwhWd783Sbr7w1V1aHuntrMIyq13zySPSfKBNesryV8c++lwDLy7qh7Q3VcmyXSk8vFJXpDE/73ecd1SVZ/W3R9J8qDDK6vqpCSC8g6muz+e5LlV9ZvTP98d/w1dBicleVNm/w3vqrpXd79z+ry8AwZz/GHYer+X5G6H42JeVf3JsZ8Ox8A3J7nV/6lOX0H6zVX1S9szJY6Bh3b3zcknYuOwT8m//F5e7mC6+0CSr6uqxyX54HbPh6Oru+9zhE0fT/LEYziVHc9nKAEAGOLXBgEAMERQAgAwRFACADBEUAIAMERQAmyBqjphu+cAsF0EJbDUqurtVfW9VfWWqrqxqn6jqu4ybfv+qnpnVf1DVT2tqrqq7jtte1FV/UJV7a2qDyd5RFU9rqreXFUfrKrrq+rZc89zn+n+TzP/fGIAAAHMSURBVJ22faCqnlFVD5me+2BV/a/teRcAxvg9lADJU5KcleSmJK9NckFVvT3Js5I8MsnfJfnlde73DUkem+TxSU5M8mWZ/V7Sq5LcP8kfVdWV3f07c/c5M8lKkocm2ZPk/yZ5VGa/v/LNVfWbvn0FON44QgmQ/Fx3/0N3vz/JKzL7ju6nJHlhd181fRvOs9e53+9292u7++PdfVN3/0l3v3Vafktm3/X8sDX3+dFp7B8m+XCSl3X3e7r7hiR/nsR3vwPHHUEJkLxr7uePJLlbkn+d5Pq59dfnk91qXVWdWVWvrqr3VtWNSZ6R5B5r7vPuuZ8/us7y3W7j3AG2naAEWN87k5w6t3zaOmPWftXYSzM7jX1ad5+U5Bfj+36BJSAoAdZ3eZKnVtUXVtWnJfnhBe7z6Une3903VdUZmX3GEuAOT1ACrKO7/yDJzyV5dZL9SV4/bbp5g7v95ySXVNU/Jbk4sygFuMOr7rVnbABYq6q+MMnfJLlzdx/a7vkA7CSOUAIcQVU9saruXFV3T/KcJK8QkwCfTFACHNnTk7wnyduSfCzJf9re6QDsTE55AwAwxBFKAACGCEoAAIYISgAAhghKAACGCEoAAIb8fzqd3FK6tB9iAAAAAElFTkSuQmCC\n",
      "text/plain": [
       "<Figure size 720x720 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "from sklearn.pipeline import make_pipeline\n",
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "from sklearn.naive_bayes import MultinomialNB\n",
    "\n",
    "micro_fscores = []\n",
    "\n",
    "for ngram_range in [(1, 1), (2, 2), (3, 3), (4, 4), (5, 5)]:\n",
    "\n",
    "    pipeline = make_pipeline(\n",
    "        CountVectorizer(ngram_range=ngram_range, min_df=3, max_df=0.2, strip_accents='ascii'),\n",
    "        MultinomialNB(fit_prior=False)\n",
    "    )\n",
    "\n",
    "    pipeline.fit(df_train['text'], y_train)\n",
    "    y_test_pred =  pipeline.predict(df_test['text'])\n",
    "\n",
    "    p, r, f, s = precision_recall_fscore_support(y_test, y_test_pred)\n",
    "\n",
    "    micro_fscore = precision_recall_fscore_support(y_test, y_test_pred, average='micro')[2]\n",
    "\n",
    "    print(f'Micro FScore = {micro_fscore} @ ngram_range = {ngram_range}')\n",
    "    \n",
    "    micro_fscores.append([ngram_range[-1], micro_fscore])\n",
    "    \n",
    "fig, ax = plt.subplots(1, 1, figsize=(10, 10))\n",
    "\n",
    "pd.DataFrame(\n",
    "    micro_fscores,\n",
    "    columns=['ngram', 'Micro FScore']\n",
    ").set_index('ngram')['Micro FScore'].plot(\n",
    "    title='Micro FScore vs ngram',\n",
    "    color='k',\n",
    "    kind='bar',\n",
    "    ax=ax,\n",
    ")\n",
    "\n",
    "fig.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 156,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/tarek/anaconda3/envs/scikitbook/lib/python3.6/site-packages/ipykernel_launcher.py:53: UserWarning: Matplotlib is currently using module://ipykernel.pylab.backend_inline, which is a non-GUI backend, so cannot show the figure.\n"
     ]
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAA5YAAAHwCAYAAADdOfAQAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy8li6FKAAAgAElEQVR4nOzde5zPdf7///vDZAyNY7Rq0XEcU5QzkbXJIWFQaqtZ0a6kX9/Vkk9JbbFJ2t2SEtFhV0mFaKllrcMYUisqWc0wLBp85Hwch+fvj9d73p/3DDNm5jmMMbfr5TIX8zq8X6/nPDGP9/31er6eb3POCQAAAACA/CpR2A0AAAAAABRtBEsAAAAAgBeCJQAAAADAC8ESAAAAAOCFYAkAAAAA8EKwBAAAAAB4IViiWDCzX5vZ8Tzs/7aZzT+bbULAzBaa2ZsRyxeZ2WQz+8nMnJndUojNAwCcY2Z2S+j3f7XQ8pWh5VaF3bbz1en6yMzqm9kKMztiZhsLsXkoJgiWOGdCYc2Fvo6b2SYzG29ml5yD038g6ed52P9RSb3OUluQsx6S7pHURdJlkpIKtzkAcOHLUqNPmNkWM3vXzPJSO3F+GS1pn6TakhoXcltQDBAsca4tURAWrpT0/ykIEe9mt7OZRRfESZ1zh51z2/Ow/17n3O6CODfyLE7SVudcknNum3MuvbAbBADFREaNrqHgAl9DSR8WaovgI07SIufcRufc/xZ2Y3DhI1jiXEsPhYUtzrlPJP1FUgczKx0xjONXZjbHzA5Kek6SzOxaM/vYzPaY2W4z+4eZ1Y88sJndZGafmdk+MzsQGv7RNLQt01BYMytnZm+Z2TYzO2pmm83sTxHbMw2FtcDvzWyDmaWb2Xoz+39Zzr/RzJ41s5fNbJeZbTezP5vZRdl1hpktNbMJp1m/1sxGhL6vZ2afh372g6Ft9+VwzF+H7gi3NbNvzexwaLjp5WbW2sy+Dh1nfuSVaDO7ysymm9mPZnYo9Nr7IrZXCvXTyxHrLjWzNDP7Y3btydK2K0J/R4dDx3oky/aFCv7Orw79W9iYm+MCAApERo3e6pxbLGmCpOZmVi5jBzMraWbPmFlqaIjlGjP7beRBzCzWzP4S+j1/NFQfn4jYPjJUyw6F9hlvZuXz22gziwvVjBZZ1jcNrY8LLfcLnfdIqE4vttBw22yOu9DMJpnZCDPbEarDI82shJkND9X5/zWzkVled4+ZfWFme81sp5n93cxqRmy/M/ReoknEuvtDtfH6XP7Md5pZSuhnSZJ0fcS2K83MSbpG0rOhPngmN8cFfBAsUdgOK/h3GBm+XpA0RdJ1ksab2c8kJUraIelmSc0krZO00MyqSEH4krRY0m5Jv1BwlfXPyv7f+AhJN0rqquCK3l2S1ubQzgEKAs8oSfUkvShplJn1zbLfI5LSJDUNfT9QUkIOx31HUi8zK5WxIlRoauv/7uS+L+knSS0k1Zc0KPRz5qSEpKcl9ZPUUsEw4A8kPSvpodC6apL+FPGaWEkLJHUMnWeCpLfMrK0kOed2SfqVpAFm1sXMTNJfJaVKGn6G9ii0/wxJl0i6RcFQ1zsU/D1kiJf0kqSNCq6aM3QHAAqBmV0uqaekE6GvDBMV/K7+raQ6CurKCxn1MPS7/lMFv98fCe1zv6TIO2aHJf1GUl1Jv1ZQE17Jb1udc8mSlknKetE1QdIy51yymd0kabyk5yXVktRGOYyYitBTUklJrRTU3yck/V1BzbxZ0u8lPWFmHSNeU0r/9z7jVgX993cLjcJyzk1TUP/ft+BCd01J4yQ95pz75kwNMrOGCt4bfCjpBkljJL0csctmBTV0i4L3VJeF9gHOLuccX3ydky9Jb0uaH7FcV9J6SctDy1dKcpKeyvK6ZzL2iVhnodf+v9DyXyWtllQim3P/WtLxiOVPJL2dh7ZuljQ6yz5/lrQhYnmjpFlZ9pkr6f0czlNBQYHtFbHuVQWFMGN5r6Rf56Gffx3qxwYR6waH1t0Use53knae4VifSJqYZd3TknYqCIC7JV2Ry3b9MtSGmhHrqoR+/jez/H2nFPa/V7744ouv4vQVqnvHJR2QdCj0+9pJGhOxz1WSTkqqneW1wyWtCn3fLvS6Rnk4d3dJRzNquIKg6SRVCy1nvD9olcMx+kvaJSk6tByt4KLsbyPOsVdSuTy0a2HGzxWxbo2kb7OsWx3ZT6c5TqVQ+1tGrCsTOtY0SV9LmpGHdv1N0tIs6wZm7aPQ+5Jhhf1vi6/i88UdS5xrt1gwTPWwpO8kbVDwHEekFVmWG0u6KfS6A2Z2QNJ+BYUmLrTPTZL+6Zw7mct2vCapp5l9Z8HQ1Y5mdtr/D6EhQNUU3BGNtEjSlWZWJmLdqiz7/CjpZ9k1wjm3R9Isha6ymllJSb2V+SrqGElvhobkPGNmN556pFMPLenbiOVtoT+/ybLuEjOLCp27jJmNCg1r2hXq506Srshy7Ock/aDgym1/59ymXLRHCi4k7HTO/RBuZPDMx7pcvh4AcHZ9IamBpCYKftcvkzQsYnsjBRd2v8pSk59Q5nq82zn3VXYnMbP40DDUH0Ovn6IgCFb1aPsHCsLa7aHl2yVdHFovSfMUvOdINbOpZvYbM6uci+OuzrK8TZlraca6SzMWzKyBmc0IDRfeL+m/oU3heuqcO6RgtFR86LVZR0DlpK5OndguMQ+vB84KgiXOtYyiVUdSjHPuVufchiz7HMyyXELSP0Ovi/yqpeDuVp455z5XMDnBSEkxCq7+LcgIWR6yTjTjdOb/Z+8qeM60iqTOCobXTI1o63OSaiq4qnmdpOUWev4yByedc5FDl1zoWMeyrlPwJkEKhvfeK+kPktoq6OM5Cop9pMtC7TkR+hMAcGE47JxLcc5955wbruBRh7ER2zPqWQtlrsfXKeIZv5xYMPfBhwou1nZXMFy0f2hzvifsc8GEe7MVDLtV6M9ZoQu4cs4dUBCMuyu4ONpfUkpoiGxOjmVZdtmsKyEFF2kl/SO0ro+CkN44tJz158v4aJDyCkbwAEUawRLnWkbR2uhyP9vnVwqea9wSem3kV8YzG/+W1C67u46n45zb5Zx73zn3WwWBro2Cq4BZ99un4DmF1lk2tZGUGrrq6ONzBcN3eisohJ+6LDPSOuc2OOdec871VDDk6CHPc55Oa0lTnHPTnHOrFVzZzRQcQ/07RcEV3LskDc86WUIOvpdUOWMShdDxKiu4QAAAOP88I6mPmTUKLf879GeN09Tj9RH7VIx4TVatFIxeGeac+yI0iiXbCXTy6B1JncysloIRN5meoXTOnXDOLQ6F5psUzImQddSUrzoKQuKTzrmFzrm1kirq/y7iSpLM7DoF8xz0kzRf0tTI+RbO4HsF4T5SS69WAwWAYImi4FVJUZI+MbObQ7OdtQrNzJbxi3W0gmE4U8yskZldY2a9zKz56Q4Yem28mdUKBZ1fKXiu5L+n21/Bw/6PmNmDFsw+91sF4S5Xs6HmxDl3XNJ7oeN1VlAYM9oZa2bjzOwXFsza2lBSBwVFpaCtk9TVzJqYWV0Fk/dcnmWfJxWE/Puccx+H9nnPzCrk4vj/VBBI/xY6RwMFITXrlV8AwHnABZPizFYwukfOuRRJkyVNNLP7LJix/QYze8DMHg+9bIGCjy35wMy6hmpXSzPrF9q+TlIVM+trZleb2f0KJsgrCJ8pePZ/aujPzzI2hNryOwtmkK8hqZuk6ir4erpJwfOij4Tei7RTMLFOxighmVmMgsl3Zjrn3pb0gKTKCt7L5MafFczWO9LMappZd0mPFeDPAOQLwRLnPRd8/mRzBRPGTFdQlKYoeFYhLbTPtwoe9q+i4NnHVQp+yZ449YiSpCMKZrL7t4I7otdL6uic25vN/q8ruFP4hIIi9Likoc65SX4/Xdg7Cq5y7lUw4U+G4wqudE5SMGvt55K2q+CvsErBZD6bJP1LQQjcKumjjI2hED9c0gPOuR9Dqx8LtfmUj0zJyjnnFBTyvQqGQH2qYKjtyoL7EQAABexFSe3N7JbQ8m8UBJsnFdTDfyqYfXWDFP5d31nB7/fxCmr23xQEJznnPlUQVP+oYC6A3gommPMWcaG2gaT3QssZdiuYjfwzBUNhR0saUYB1PKMNOxU8VnKrgsl5xiiYOTZyDog/K3j+s3/oNbsU1PUBZtY5F+f4d2j/3gr6cKiCGg4UKgv+/wMAAAAAkD/csQQAAAAAeDljsDSzyWa2w8y+y2a7mdkrZpZiZt/k8qMQAFxgzGx85PTzWb7WFHb7gKKE2gsUXznU0gNm9kRhtw/IzhmHwppZawWTmrzrnLvuNNs7SXpEwexbTSW97JxrehbaCuA8ZmaXSiqXzeZjefi8S6DYo/YCxZeZXZvD5l2hZzKB885FZ9rBObfYzK7MYZeuCgqfU/D5ehXM7DLnXFrkTnv37uVhTuACtmfPnhy3792b3bxIwJmVL1/ezrzXhaMgai91FyiaqKc4X+S19hbEM5Y/l7Q5YnlLaB0AADg7qL0AgPMKk/cAAAAAALyccShsLmxV8AGzGaqF1qEAJScnKy4urrCbUWTRf37ov/yj73CW5Kn2VniLsgwAOLM9ffI/+KUg7ljOknR/aIa6ZpL2Zn2+EgAAFChqLwDgvHLGO5Zm9r6kWyRVNrMtkp6WVFKSnHPjJc1RMCtdiqRDkvqcrcYCAFAcUHsBAEVNbmaFvfsM252khwusRQCKDeecDhw4oJMnTxZ2U86amJgYZvDLhRIlSig2NlZmxWry12xRewGcb0xOD1wp1SwXJX5VF33OST/sO6HJGyWngvkLLYhnLAEgXw4cOKBSpUopOjq6sJty1pQqVUoxMTGF3YzzXnp6ug4cOKCyZcsWdlMuSD7PzBRnPCOdf/Sdn/Ox//bv319kavaRI0eovbmQnp6uZxodLbDay6ywAArNyZMni0SBwtkXHR19Qd+5BoCijpp94Sno2kuwBAAAAAB4IVgCAAAAALwQLAEUa5UqVVKrVq3UvHlzJSQk6NChQ97H/PrrrzVkyJBst6elpen+++/P9/Gdc+rSpYv27duX72PkxZ49e/Tmm2+Gl33av2bNGj300EMF1TQAQDESWbPvuusu7dmzp0CPP2XKFA0ePFiS9Pzzz2vs2LHhbUOHDtXSpUsL9Hw5ee211zK9J+nVq1e+ft709HR17NhRx48fL8jmnRbBEkCxVrp0aSUmJmrZsmWKjo7W5MmTM213zuX5+YOGDRtq9OjR2W6/7LLL9O677+arvZL0j3/8Q9ddd53KlSuX72Pkxd69ezVp0qTwsk/769Wrpx9//FGbN28uqOYBAIqJyJpdsWLFTBc9z6Zdu3bpq6++UsuWLc/J+STp9ddf1+HDh8PLH374oSpUqJDn40RHR6tNmzaaPn16QTbvtJgVFsB5o8JbWwv0eHmdCbN58+Zas2aNNm3apB49euimm27S6tWrNW3aNKWkpOj555/X0aNHddVVV2ncuHGKjY3VypUrNXToUB08eFClSpXSJ598olWrVunVV1/VBx98oKSkJA0fPlySZGaaM2eOdu3apd69e2vZsmU6cuSIBg0apFWrVikqKkojR45U69atNWXKFM2dO1eHDx9Wamqqbr/9dj377LOSguKSkJAgSdq0aZN69eqlZs2aacWKFbrsssv03nvvqXTp0kpNTdXvf/977dy5U2XKlNHLL7+smjVrKjU1VQ8++KAOHTqkTp066fXXX9fWrVt14MAB3XPPPdqzZ4+OHz+uJ598Up07d9Yf/vAHpaamqlWrVmrbtq369esXbv8vf/lLjR07VnXq1JEkde7cWSNGjFDNmjU1ZMgQrV27VseOHdPQoUPVuXNnSVKHDh00ffp0PfroowX1Vw0AOMcKu2Y3adJEa9asCS+/8sormjFjho4ePaoOHTqEa+/777+vsWPHysxUr149TZgwQXPnztWYMWOUnp6uSpUqaeLEibr00kuzPdesWbPUrl278HL9+vV1991367PPPtPx48f19ttvq2bNmjp48OBpa9+hQ4c0YMAArV27Vtdee622bdumMWPGqGHDhho0aJBWrlypI0eO6I477tATTzyh8ePHa9u2berSpYsqVaqkTz/9VPXr19fChQs1duxY/fznP9eDDz4oKbizGhsbq0ceeSRTH9x+++164oknJAW1+dlnn9Wdd96Zpz7OK+5YAoCk48ePa968eapbt64kaf369erXr5+WL1+uiy++WC+++KJmzpypxYsXq2HDhho3bpzS09PVp08fjRo1SkuXLtXMmTNVunTpTMd9/fXXNWbMGCUmJmru3LmnbJ84caLMTElJSZo0aZIGDBigI0eOSJK+/fZbTZ48WUlJSZo+fbq2bNkiSVq+fLkaNGgQPkZkW8uXL69Zs2ZJkh599FGNHj1aixYt0nPPPafHHntMUjCcp3///kpKStLll18ePk5MTIz+9re/afHixZo9e7aGDRsm55yefvppXXXVVUpMTNRzzz2Xqf3du3fXjBkzJEnbtm3T9u3b1bBhQ7300ktq3bq1FixYoNmzZ2v48OE6ePCgpOCOblJSkt9fGACg2Dpx4oQWLVqkjh07SpIWLFig9evXa8GCBUpMTNQ333yjpUuXau3atRozZoxmz56tpUuX6oUXXpAUXEieP3++lixZoh49eujll1/O8XxZ664kXXLJJVq8eLEeeOCB8JDZ7GrfpEmTVKFCBX3xxRd68skntWrVqvBxnnrqKS1cuFBLly7V0qVL9d1336l///6qWrWqZs+erU8//TTTeSPrriTNnDlT3bt3P6UPVq9eHR66W7duXa1cuTKfvZ173LEEUKwdPnxYrVq1khQUmvvuu09paWmqXr26GjduLEn68ssvtW7dOt12222SpGPHjqlx48ZKTk5W1apVdeONN0rSaYemNmnSRE8++aR69eqlLl26KDY2NtP25cuX6ze/+Y0kqWbNmqpevbpSUlIkSW3atFH58uUlSbVr19bmzZtVrVo17dmzJ9NnTl1xxRW6/vrrJUkNGjTQf//7Xx04cEArVqwI39mUgucsJGnFihWaMmWKJKlnz5566qmnJAXDfp977jktXbpUJUqUUFpamnbs2JFj/3Xv3l3x8fF64oknNGPGDHXt2lVSUOTnzp0bLrZHjx7Vli1bVKtWLVWpUkXbtm3L8bgAAGSVUbPT0tJUs2ZNtW3bVlJQcxYsWKCbb75ZUvA52evXr9d3332nbt266ZJLLpEkVaxYUZK0detW9enTR9u3b1d6erquuOKKHM+7fft2Va5cOdO6Ll26SArq7uzZs8PtOF3tW7Zsmfr37y8pCHn16tULH2fGjBl6++23dfz4cW3fvl3r1q3Tddddl21bbrjhBu3cuVNpaWnauXOnKlSooGrVqmn8+PGZ+uDgwYNav369WrZsqaioKEVHR2v//v1n9fOiCZYAirWM5zWyuvjii8PfO+fUtm3bTM8ZSso0BCc7jzzyiDp16qR58+bptttu0/Tp01WqVKlctS1yv6ioqPCD91FRUTp58qRKlChx2v0OHz6skydPqnz58qf92bIzbdo07dy5U4sWLVLJkiVVv3798N3T7Fx++eWqWLGivvvuO82YMUN/+tOfJAV99u677572A76PHDlyyp1bAADOJKNmHzp0SD169NDEiRPVv39/Oec0aNAg9enTR1JQZ2JiYvTGG2+c9jhDhgzRww8/rE6dOmnJkiUaNWpUjueNiYk5pR5m1N7I+pxT7TudjRs3auzYsfrXv/6lChUq6KGHHjpj3ZWkrl276pNPPtGOHTvUvXv38Lkj+yCro0ePKiYmJlftyi+CJYDzRl6frzhXGjdurMGDB2vDhg26+uqrdfDgQaWlpSkuLk7btm3TypUrdeONN2r//v2nBKaNGzeqXr16qlevnlauXKkffvhB9evXD29v3ry5PvzwQ7Vp00YpKSnavHmz4uLitHr16mzbExcXp40bN+rqq6/Odp9y5crpiiuu0MyZM9WtWzc55/Tdd9+pfv36aty4sWbNmqX4+PhMD/Pv27dPlStXVsmSJbV48eLwBDtly5bV/v37sz1XfHy8XnnlFe3bty98lbVdu3aaMGGCRo8eLTPT6tWrdcMNN0iSUlJSws9kAgCKpsKs2WXKlNGoUaP0q1/9Sv369VO7du00cuRI9erVS7GxsUpLS1NsbKxat26te++9Vw8//LAqVaqk3bt3q2LFitq3b1/4UZD333//jOerVauWUlNTw3cDs5Nd7WvWrJlmzpyp1q1b6z//+Y++//57SdL+/ftVpkwZlStXTjt27ND8+fPDo6gyam/G3dZI8fHxevTRR/XTTz/p73//e/jckX3w448/qmTJkqpSpYp27dqlSy65RCVLlsxTP+cVz1gCwBlUrlxZ48aNU9++fdWiRQvdeuut+uGHHxQdHa233npLQ4YMUcuWLdW9e/dTrjROmDBBzZs3V4sWLVSyZEndeuutmbb369dPJ0+eVIsWLdSnTx+99tprZ7yj2b59+1zdiZwwYYL++te/qmXLlmrWrJnmzJkjKXjQf9y4cWrRooU2bNgQHsJ75513atWqVWrRooWmTp2qmjVrSgqmd2/WrJmaN28eHjYbqWvXrvr444/VrVu38LrBgwfr2LFj4XP/8Y9/DG9LTExU+/btz9h+AACyc8MNN6hevXr66KOP9Itf/EI9e/ZU+/bt1aJFC/Xr108HDhxQnTp19Nhjj6lz585q2bJleDKboUOHKiEhQW3atDltcMsqt3U3u9rXt29f7dy5U02bNtWIESNUu3ZtlStXTvXr19f111+vxo0bq1+/fmratGn4WAkJCerZs6duv/32U85Tp04dHThwQJdddpmqVq0qSaf0QUJCgg4cOCBJWrJkyTmpu+acO+snkaS9e/eemxNdoJKTk3N9Wx2nov/8nK3+27t3b/gZwgtVxnCcgrRt2zb1799fM2fOzNfrDx06pNKlS8vM9PHHH+ujjz7K1RXbgnD06FF17txZn332mS66KPOgmZz+PZQvX97ORfsuJNRdf9SO/KPv/JyP/VeUavbZqL0dOnTQ1KlT8/WRHydOnNCxY8cUExOj1NRUde3aVV999ZWio6MLtI3Zuffee/XMM8/o2muvPWVbQdZehsICQBFTtWpVJSQkaN++ffn6LMtVq1Zp8ODBcs6pfPnyGjdu3Flo5elt2bJFTz/99CmhEgCA89mIESO0ZcuWfAXLQ4cOqUuXLjp27Jicc3rppZfOWahMT09X586dTxsqCxqVHQCKoIyH9fOjRYsW4SnIz7VrrrlG11xzTaGcGwCA/GrUqFG+X1u2bFktXLiw4BqTB9HR0br77rvPybl4xhIAAAAA4IVgCQAAAADwQrAEAAAAAHghWAIAAAAAvBAsARRrlSpVUqtWrdS8eXMlJCTo0KFD3sf8+uuvNWTIkGy3p6Wl6f7778/38Z1z6tKli/bt25fvY2Q1efLk8EeOTJkyRWlpaeFtjzzyiP7zn//k67hdu3bVnj17CqSNAIDiLbJm33XXXQVeX6ZMmaLBgwdLCj7zeezYseFtQ4cOLdCJ7yLfKyxZskRffPFFeFtkTc6rYcOGadGiRQXSxrwiWAIo1kqXLq3ExEQtW7ZM0dHRmjx5cqbtzjmdPHkyT8ds2LChRo8ene32yy67TO+++26+2itJ//jHP3Tdddfl66NGsvPAAw+EZ4177733tG3btvC2sWPHqnbt2vk67l133aU333yzQNoIACjeImt2xYoVz1l92bVrl7766iu1bNmywI4Z+V4hMTFRK1asCG+LrMl59Zvf/EZ/+ctfCqSNecXHjQA4b8Qm3FKgxzvwzsI87d+8eXOtWbNGmzZtUo8ePXTTTTdp9erVmjZtmlJSUvT888/r6NGjuuqqqzRu3DjFxsZq5cqVGjp0qA4ePKhSpUrpk08+0apVq/Tqq6/qgw8+UFJSkoYPHy5JMjPNmTNHu3btUu/evbVs2TIdOXJEgwYN0qpVqxQVFaWRI0eqdevWmjJliubOnavDhw8rNTVVt99+u5599llJ0ocffqiEhARJ0qZNm9SzZ081aNBAq1evVu3atTV+/HiVKVNGixYt0rBhw3TixAk1bNhQf/rTn1SqVCk988wzmjt3rqKiovSLX/xCI0aM0PPPP6/Y2FjVqFFDq1at0oMPPqiYmBjNmzdPPXv21IgRI/T1118rNTVVzz33nKTgyu6qVav04osv6oMPPtAbb7yh9PR0NWrUSC+99JKioqLUqVMndezYUb///e8L7i8WAFDoCrtmN2nSRGvWrAkvv/LKK5oxY4aOHj2qDh06hGvv+++/r7Fjx8rMVK9ePU2YMEFz587VmDFjlJ6erkqVKmnixIm69NJLsz3XrFmz1K5du/By/fr11b17d82fP18xMTF68803dfXVV2vTpk0aOHCgfvrpJ1WuXFnjxo1T9erVNXPmTL3wwgsqUaKEypUrp7lz52rJkiV69dVXNXr0aL311luKiorSBx98oNGjR2vRokWKjY3Vbbfdpv79+2vBggWSgpp/9913KykpSatWrdITTzyhgwcP6pJLLtFrr72mqlWrqkaNGtq1a5e2b9+un/3sZ3nqU1/csQQAScePH9e8efNUt25dSdL69evVr18/LV++XBdffLFefPFFzZw5U4sXL1bDhg01btw4paenq0+fPho1apSWLl2qmTNnqnTp0pmO+/rrr2vMmDFKTEzU3LlzT9k+ceJEmZmSkpI0adIkDRgwQEeOHJEkffvtt5o8ebKSkpI0ffp0bdmyRZK0fPlyNWjQIHyM5ORk9e3bVytWrFDZsmU1adIkHTlyRAMGDNBbb72lpKQkHT9+XJMmTdKuXbv06aefavny5UpKSjol8HXt2lUNGjTQxIkTlZiYmKm9d9xxhz799NPw8owZMxQfH69169Zp+vTp+vzzz5WYmKioqChNmzZNklShQgUdPXpUu3bt8v0rAgBAknTixAktWrRIHTt2lCQtWLBA69ev14IFC5SYmKhvvvlGS5cu1dq1azVmzBjNnj1bS5cu1QsvvCApuJA8f/58LVmyRD169NDLL7+c4/my1l1JKleunJKSkvTggw/qf/7nfyRJQ0tBXAEAACAASURBVIYMCQe/Xr166fHHH5ckjR49Wh9//LGWLl16yhDXK664Qn369NGAAQOUmJioFi1ahLfVrFlT6enp2rhxo6Sg7nbv3l3Hjh3TkCFD9O6772rRokW69957wxd9JemGG27Q8uXL89GzfgiWAIq1w4cPq1WrVrrllltUrVo13XfffZKk6tWrq3HjxpKkL7/8UuvWrdNtt92mVq1a6f3339fmzZuVnJysqlWr6sYbb5QUFJmLLso8EKRJkyZ68sknNX78eO3du/eU7cuXL9edd94pKSgg1atXV0pKiiSpTZs2Kl++vGJiYlS7dm1t3rxZkrRnzx6VLVs2fIxq1aqpWbNmkqQ777xTy5YtU3JysmrUqKFrr71WknTPPfcoKSlJ5cqVU6lSpTRw4EDNmjVLZcqUyXVfVa5cWVdeeaW+/PJL7dq1Sz/88IOaNWumRYsWafXq1Wrbtq1atWqlRYsWhYugJFWpUiXTM5sAAORHRs2uWbOmduzYobZt20oKguWCBQt08803q3Xr1kpJSdH69eu1ePFidevWTZdccokkqWLFipKkrVu3Kj4+Xi1atNArr7xyxnkEtm/frsqVK2da17Nnz/CfGcNYv/zyS/Xq1UuS1Lt373C4a9q0qQYMGKB33nknz4/XdO/eXTNmzJAkTZ8+XfHx8UpOTtbatWvVrVs3tWrVSmPGjNGPP/4Yfk2VKlUyPdJyrjAUFkCxlvG8RlYXX3xx+HvnnNq2batJkyZl2idyCE52HnnkEXXq1Enz5s3TbbfdpunTp6tUqVK5alvkflFRUTp+/Hj4+5MnT6pEidNfGzSzbI950UUXacGCBVq0aJE++eQTTZw4UbNnz85VeyQpPj5eM2bMUM2aNXX77bfLzOSc0913362nn376tK85cuTIKXdqAQDIq4yafejQIfXo0UMTJ05U//795ZzToEGD1KdPH0lB3YmJidEbb7xx2uMMGTJEDz/8sDp16qQlS5Zo1KhROZ43JiYmPJrodHKqu5L05z//WV999ZU+//xztWnTJk+T68THxyshIUFdunSRmemaa67RmjVrVLt2bc2bN++0rymsukuwBHDeyOvzFedK48aNNXjwYG3YsEFXX321Dh48qLS0NMXFxWnbtm1auXKlbrzxRu3fv/+UX+QbN25UvXr1VK9ePa1cuVI//PCD6tevH97evHlzffjhh2rTpo1SUlK0efNmxcXFafXq1dm2Jy4uThs3btTVV18tSdqyZYtWrFihJk2a6KOPPlKzZs0UFxenzZs3h9s8depUtWzZUgcOHNDhw4fVvn17NW3a9JShPZIUGxur/fv3n/bcXbp00UsvvaRvvvlGf/jDHyQFd1bvueceDRgwQFWqVNHu3bu1f/9+1ahRQ8457dixQzVq1MhzvwMAzl+FWbPLlCmjUaNG6Ve/+pX69eundu3aaeTIkerVq5diY2OVlpam2NhYtW7dWvfee68efvhhVapUSbt371bFihW1b98+XX755ZKUq9lXa9WqpdTUVN18883hdTNmzNDvfvc7TZ8+PTzCqUmTJvr444/Vu3dvTZs2Tc2bN5ckpaamqlGjRmrUqJHmz58ffrQlQ05196qrrlJUVJRGjx6t+Ph4ScH7gJ07d4Zr/7Fjx5SSkqI6depIklJSUtStW7c89qo/giUAnEHGA/h9+/bV0aNHJQXTeV977bV66623NGTIEB0+fFilS5fWzJkzM712woQJWrZsmcxMderU0a233pppeEq/fv00aNAgtWjRQlFRUXrttdfOeEezffv2SkxMDAfLuLg4vfnmmxo4cKBq1aqlvn37KiYmRuPGjVNCQkJ48p4HHnhAu3fv1j333BO+8jpy5MhTjn/PPfdo0KBB4cl7IlWoUEG1atXSf/7zH910002SpNq1a2vYsGHq3r27Tp48qZIlS2rMmDHhiYAaNWp0yhBgAAB83HDDDapXr54++ugj9e7dW+vWrVP79u0lBXc233zzTdWpU0ePPfaYOnfurBIlSuj666/X66+/rqFDhyohIUEVKlRQ69attWnTphzP1b59e7399tuZPipsz549atGihUqVKhUe0TR69Gg9/PDDeuWVV8LvHSTpqaee0oYNG+ScU+vWrVW/fv1Mo6U6duyo+++/X3PmzDntrPLx8fF66qmnwhedo6Oj9c477+jxxx/Xvn37dOLECT300EOqU6eOjh07ptTUVDVs2NCvg/PBnHPn5ER79+49Nye6QCUnJysuLq6wm1Fk0X9+zlb/7d27V+XLly/w455PMobjFKRt27apf//+mjlzpjZt2hSeYfZ89Pjjj6tTp05q06bNGffN6d9D+fLlcx5nhFNQd/1RO/KPvvNzPvZfUarZZ6P2dujQQVOnTlWFChVUv359LVy4MPzs5vlk9uzZWr16tYYNG5ar/Quy9jJ5DwAUMVWrVlVCQoL27dtX2E05o7p16+YqVAIAcD4bMWLEKUNYz0cnTpzQwIEDC+XcjE0CgCKoe/fukoKZaM/Xu5WSwp+3CQBAUdaoUaPw999++20htiRnhfFsZQbuWAIAAAAAvBAsARSaEiVKKD09vbCbgfNAenp6th+fAgAofNTsC09B116GwgIoNLGxseGPv7hQ7du3T+XKlSvsZpz3SpQoodjY2MJuBgAgG0WpZlN7c6egay/BEkChMTOVLVu2sJtxVu3YsUPVq1cv7GYAAOClKNVsam/hYNwRAAAAAMALwRIAAAAA4IVgCQAAAADwQrAEAAAAAHghWAIAAAAAvBAsAQAAAABeCJYAAAAAAC8ESwAAAACAF4IlAAAAAMALwRIAAAAA4IVgCQAAAADwQrAEAAAAAHghWAIAAAAAvBAsAQAAAABeCJYAAAAAAC8ESwAAAACAF4IlAAAAAMALwRIAAAAA4IVgCQAAAADwQrAEAAAAAHghWAIAAAAAvBAsAQAAAABeCJYAAAAAAC8ESwAAAACAF4IlAAAAAMALwRIAAAAA4IVgCQAAAADwQrAEAAAAAHghWAIAAAAAvBAsAQAAAABeCJYAAAAAAC8ESwAAAACAF4IlAAAAAMALwRIAAAAA4IVgCQAAAADwQrAEAAAAAHghWAIAAAAAvBAsAQAAAABeCJYAAAAAAC8ESwAAAACAF4IlAAAAAMALwRIAAAAA4IVgCQAAAADwQrAEAAAAAHjJVbA0sw5mts7MUsxs6Gm21zCzf5nZ12b2jZl1KvimAgBQPFB3AQBFzRmDpZlFSRonqaOkupLuNrO6WXYbJmmac66hpN6SXivohgIAUBxQdwEARVFu7lg2kZTinNvgnEuXNFVS1yz7OEnlQt+Xl/RjwTURAIBihboLAChyzDmX8w5mPSV1cM71Cy3fJ6mpc25gxD6XSfqHpIqSLpb0S+fcvyOPs3fv3vCJkpOTC+wHAABcmOLi4sLfly9f3gqxKecUdRcAUFh8au9FBdSGuyW97Zx7ycyaS/qrmV3nnDt5up0jG4zcSU5Opt880H9+6L/8o+9wllB3zwH+/+YffeeH/vND/xWO3AyF3SqpesRytdC6SH0lTZMk59wySTGSKhdEAwEAKGaouwCAIic3wfJLSXFmdpWZRSuYJGBWln3+K6mdJJlZHQUF7n8LsqEAABQT1F0AQJFzxmDpnDsuaaCkzyWtVTAL3Roze9bM7gjt9pikB81staT3Jf3anenhTQAAcArqLgCgKMrVM5bOuTmS5mRZNzzi++8ltSzYpgEAUDxRdwEARU1uhsICAAAAAJAtgiUAAAAAwAvBEgAAAADghWAJAAAAAPBCsAQAAAAAeCFYAgAAAAC8ECwBAAAAAF4IlgAAAAAALwRLAAAAAIAXgiUAAAAAwAvBEgAAAADghWAJAAAAAPBCsAQAAAAAeCFYAgAAAAC8ECwBAAAAAF4IlgAAAAAALwRLAAAAAIAXgiUAAAAAwAvBEgAAAADghWAJAAAAAPBCsAQAAAAAeCFYAgAAAAC8ECwBAAAAAF4IlgAAAAAALwRLAAAAAIAXgiUAAAAAwAvBEgAAAADghWAJAAAAAPBCsAQAAAAAeCFYAgAAAAC8ECwBAAAAAF4IlgAAAAAALwRLAAAAAIAXgiUAAAAAwAvBEgAAAADghWAJAAAAAPBCsAQAAAAAeCFYAgAAAAC8ECwBAAAAAF4IlgAAAAAALwRLAAAAAIAXgiUAAAAAwAvBEgAAAADghWAJAAAAAPBCsAQAAAAAeCFYAgAAAAC8ECwBAAAAAF4IlgAAAAAALwRLAAAAAIAXgiUAAAAAwAvBEgAAAADghWAJAAAAAPBCsAQAAAAAeCFYAgAAAAC8ECwBAAAAAF4IlgAAAAAALwRLAAAAAIAXgiUAAAAAwAvBEgAAAADghWAJAAAAAPBCsAQAAAAAeCFYAgAAAAC8ECwBAAAAAF4IlgAAAAAALwRLAAAAAIAXgiUAAAAAwAvBEgAAAADghWAJAAAAAPBCsAQAAAAAeCFYAgAAAAC8ECwBAAAAAF4IlgAAAAAALwRLAAAAAIAXgiUAAAAAwAvBEgAAAADghWAJAAAAAPBCsAQAAAAAeCFYAgAAAAC85CpYmlkHM1tnZilmNjSbfe40s+/NbI2ZvVewzQQAoPig7gIAipqLzrSDmUVJGifpVklbJH1pZrOcc99H7BMn6X8ktXTO7TazS89WgwEAuJBRdwEARVFu7lg2kZTinNvgnEuXNFVS1yz7PChpnHNutyQ553YUbDMBACg2qLsAgCLnjHcsJf1c0uaI5S2SmmbZp6YkmdlSSVGSnnHOfZbdAZOTk/PYTEj0my/6zw/9l3/0Xf7ExcUVdhMKC3X3PELf5R9954f+80P/5Y9P7c1NsMztceIk3SKpmqTFZlbfObfndDsX4zcL+ZacnEy/eaD//NB/+Uff4Syh7p4D/P/NP/rOD/3nh/4rHLkZCrtVUvWI5WqhdZG2SJrlnDvmnEuV9IOCggcAAPKGugsAKHJyEyy/lBRnZleZWbSk3pJmZdlnpoKrpjKzygqG6GwowHYCAFBcUHcBAEXOGYOlc+64pIGSPpe0VtI059waM3vWzO4I7fa5pJ/M7HtJ/5I02Dn309lqNAAAFyrqLgCgKMrVM5bOuTmS5mRZNzzieydpUOgLAAB4oO4CAIqa3AyFBQAAAAAgWwRLAAAAAIAXgiUAAAAAwAvBEgAAAADghWAJAAAAAPBCsAQAAAAAeCFYAgAAAAC8ECwBAAAAAF4IlgAAAAAALwRLAAAAAIAXgiUAAAAAwAvBEgAAAADghWAJAAAAAPBCsAQAAAAAeCFYAgAAAAC8ECwBAAAAAF4IlgAAAAAALwRLAAAAAIAXgiUAAAAAwAvBEgAAAADghWAJAAAAAPBCsAQAAAAAeCFYAgAAAAC8ECwBAAAAAF4IlgAAAAAALwRLAAAAAIAXgiUAAAAAwAvBEgAAAADghWAJAAAAAPBCsAQAAAAAeLmoME5a4a2thXHaIq6MlEi/5R/954f+yz/6Lr/29Pl5YTcBAADkEncsAQAAAABeCJYAAAAAAC8ESwAAAACAl0J5xpLnZvIuOTlZcXFxhd2MIov+80P/5R99BwAAigPuWAIAAAAAvBAsAQAAAABeCJYAAAAAAC8ESwAAAACAF4IlAAAAAMALwRIAAAAA4IVgCQAAAADwQrAEAAAAAHghWAIAAAAAvBAsAQAAAABeCJYAAAAAAC8ESwAAAACAF4IlAAAAAMALwRIAAAAA4IVgCQAAAADwQrAEAAAAAHghWAIAAAAAvBAsAQAAAABeCJYAAAAAAC8ESwAAAACAF4IlAAAAAMALwRIAAAAA4IVgCQAAAADwQrAEAAAAAHghWAIAAAAAvFxUGCeNTbilME5bpDUs7AYUcfSfH/ov/+i7/DvwzsLCbgIAAMgl7lgCAAAAALwQLAEAAAAAXgiWAAAAAAAvhfKMJc/N5F1ycrLi4uIKuxlFFv3nh/7LP/oOAAAUB9yxBAAAAAB4IVgCAAAAALwQLAEAAAAAXgiWAAAAAAAvBEsAAAAAgBeCJQAAAADAC8ESAAAAAOCFYAkAAAAA8EKwBAAAAAB4IVgCAAAAALwQLAEAAAAAXgiWAAAAAAAvBEsAAAAAgJdcBUsz62Bm68wsxcyG5rBfDzNzZtao4JoIAEDxQt0FABQ1ZwyWZhYlaZykjpLqSrrbzOqeZr+ykh6V9EVBNxIAgOKCugsAKIpyc8eyiaQU59wG51y6pKmSup5mv+ckvSDpSAG2DwCA4oa6CwAoci7KxT4/l7Q5YnmLpKaRO5jZjZKqO+f+bmaDz3TA5OTkPDUSAfrND/3nh/7LP/ouf+Li4gq7CYWFunseoe/yj77zQ//5of/yx6f25iZY5sjMSkj6k6Rf5/Y1xfjNQr4lJyfTbx7oPz/0X/7Rdyho1N1zh/+/+Uff+aH//NB/hSM3Q2G3SqoesVwttC5DWUnXSVpoZhslNZM0i4kEAADIF+ouAKDIyU2w/FJSnJldZWbRknpLmpWx0Tm31zlX2Tl3pXPuSknLJd3hnPvqrLQYAIALG3UXAFDknDFYOueOSxoo6XNJayVNc86tMbNnzeyOs91AAACKE+ouAKAoytUzls65OZLmZFk3PJt9b/FvFgAAxRd1FwBQ1ORmKCwAAAAAANkiWAIAAAAAvBAsAQAAAABeCJYAAAAAAC8ESwAAAACAF4IlAAAAAMALwRIAAAAA4IVgCQAAAADwQrAEAAAAAHghWAIAAAAAvBAsAQAAAABeCJYAAAAAAC8ESwAAAACAF4IlAAAAAMALwRIAAAAA4IVgCQAAAADwQrAEAAAAAHghWAIAAAAAvBAsAQAAAABeCJYAAAAAAC8ESwAAAACAF4IlAAAAAMALwRIAAAAA4IVgCQAAAADwQrAEAAAAAHghWAIAAAAAvBAsAQAAAABeCJYAAAAAAC8ESwAAAACAF4IlAAAAAMALwRIAAAAA4IVgCQAAAADwQrAEAAAAAHghWAIAAAAAvBAsAQAAAABeCJYAAAAAAC8ESwAAAACAF4IlAAAAAMALwRIAAAAA4IVgCQAAAADwQrAEAAAAAHghWAIAAAAAvBAsAQAAAABeCJYAAAAAAC8ESwAAAACAF4IlAAAAAMALwRIAAAAA4IVgCQAAAADwQrAEAAAAAHghWAIAAAAAvBAsAQAAAABeCJYAAAAAAC8ESwAAAACAF4IlAAAAAMALwRIAAAAA4IVgCQAAAADwQrAEAAAAAHghWAIAAAAAvBAsAQAAAABeCJYAAAAAAC8ESwAAAACAF4IlAAAAAMALwRIAAAAA4IVgCQAAAADwQrAEAAAAAHghWAIAAAAAvBAsAQAAAABeCJYAAAAAAC8ESwAAAACAF4IlAAAAAMALwRIAAAAA4IVgCQAAAADwQrAEAAAAAHghWAIAAAAAvBAsAQAAAABeCJYAAAAAAC8ESwAAAACAF4IlAAAAAMBLroKlmXUws3VmlmJmQ0+zfZCZfW9m35jZP83sioJvKgAAxQN1FwBQ1JwxWJpZlKRxkjpKqivpbjOrm2W3ryU1cs5dL+kjSaMLuqEAABQH1F0AQFGUmzuWTSSlOOc2OOfSJU2V1DVyB+fcv5xzh0KLyyVVK9hmAgBQbFB3AQBFzkW52OfnkjZHLG+R1DSH/ftKmpvTAZOTk3NxWmRFv/mh//zQf/lH3+VPXFxcYTehsFB3zyP0Xf7Rd37oPz/0X/741N7cBMtcM7N7JTWS1Can/Yrxm4V8S05Opt880H9+6L/8o+9wNlF3zy7+/+YffeeH/vND/xWO3ATLrZKqRyxXC63LxMx+KelJSW2cc0cLpnkAABQ71F0AQJGTm2csv5QUZ2ZXmVm0pN6SZkXuYGYNJb0h6Q7n3I6CbyYAAMUGdRcAUOScMVg6545LGijpc0lrJU1zzq0xs2fN7I7Qbi9KipX0oZmtMrNZ2RwOAADkgLoLACiKcvWMpXNujqQ5WdYNj/j+lwXcLgAAii3qLgCgqMnNUFgAAAAAALJFsAQAAAAAeCFYAgAAAAC8ECwBAAAAAF4IlgAAAAAALwRLAAAAAIAXgiUAAAAAwAvBEgAAAADghWAJAAAAAPBCsAQAAAAAeCFYAgAAAAC8ECwBAAAAAF4IlgAAAAAALwRLAAAAAIAXgiUAAAAAwAvBEgAAAADghWAJAAAAAPBCsAQAAAAAeCFYAgAAAAC8ECwBAAAAAF4IlgAAAAAALwRLAAAAAIAXgiUAAAAAwAvBEgAAAADghWAJAAAAAPBCsAQAAAAAeCFYAgAAAAC8ECwBAAAAAF4IlgAAAAAALwRLAAAAAIAXgiUAAAAAwAvBEgAAAADghWAJAAAAAPBCsAQAAAAAeCFYAgAAAAC8ECwBAAAAAF4IlgAAAACA/7+9uw2V9CzvAP6/TEikvmxLImJr4poawRgKpqmtoFax2Ca0BjEfDEh9yQdrUQsFwSIUq4hY3/pBwRdSjO2HRtvSrhixtBpFSdr6soqJ2G7WqGm02kS3RqsSc/XDPMphPbtnztxzzszs+f3ggWfm3LPPNdfO5p9rZs7zDDFYAgAAMMRgCQAAwBCDJQAAAEMMlgAAAAwxWAIAADDEYAkAAMAQgyUAAABDDJYAAAAMMVgCAAAwxGAJAADAEIMlAAAAQwyWAAAADDFYAgAAMMRgCQAAwBCDJQAAAEMMlgAAAAwxWAIAADDEYAkAAMAQgyUAAABDDJYAAAAMMVgCAAAwxGAJAADAEIMlAAAAQwyWAAAADDFYAgAAMMRgCQAAwBCDJQAAAEMMlgAAAAwxWAIAADDEYAkAAMAQgyUAAABDDJYAAAAMMVgCAAAwxGAJAADAEIMlAAAAQwyWAAAADDFYAgAAMMRgCQAAwBCDJQAAAEPOXnUBAMDeevDzn7bqEjbSE1ZdwAbTuzH6N0b/Fnfv9Tct/FifWAIAADBkrsGyqn6nqr5UVceq6pXb/Pzcqrph+vm/VtXhZRcKAAeF3AVg0+w4WFbVWUnenuSKJJckuaaqLjlp2bVJvt3dj0ny1iRvWHahAHAQyF0ANlF19+kXVD0pyau7+7en23+SJN39+i1rPjytubmqzk7yjSQP6y1/+IkTJ05/IAA4hUOHDtWqa9gvcheAdbDb7J3nq7C/lORrW27fOd237Zruvi/JiSTn7aYQACCJ3AVgAzl5DwAAAEPmudzIfyW5YMvtR073bbfmzukrOYeS3L11wUH6GhMADJC7AGyceT6x/PckF1fVo6vqnCTPTXLkpDVHkjx/2r86yUd6p1/eBAC2I3cB2Dg7fmLZ3fdV1UuTfDjJWUn+srtvrarXJPlUdx9Jcl2Sv6qqY0nuySwEAYBdkrsAbKK5fseyu2/s7sd29y939+um+/50CrckeVqSX5n2/767j299/Kmut1VV51XVR6vq3qp62zKe0CZa9HplVXW4qv6vqo5O2zv2u/Z1M0cvn1pVn6mq+6rq6lXUuK7m6N0fV9VtVfX5qvqXqnrUKupcVzv1b8u651RVV9Xl+1nfupvj9XfhlBefnV6DV66izv0id/eW3F0euTtG9o6RvYvbk9zt7qEts3dTb09yUZJzknwuySUnrfnDJO+Y9p+b5IZp/0FJnpzkD5K8bbSWTdwG+3c4yRdW/RzWZZuzl4cz+5+x9ya5etU1r8s2Z++enuTnpv2X/OR1aJuvf9O6hyT5eJJbkly+6rrXZZvz9feuJC+Z9i9Jcseq617zfsndvemf3N19L+XuWP9k70D/pnWyd4HeLZK7yzgr7BOTHOvu4939oyR/k+Sqk9ZcleT6af9vkzyjqqq7v9fdn0jygyXUsakW7t8+1rgpduxld9/R3Z9Pcv8qClxj8/Tuo939/enmLZmdUISZef4dJ8lrM7uQ/UH+b9525ulfJ3notH8oyV37WN+6kbtj5O7yyN0xsneM7F3cnuTuMgZL19saM9q/R08fUX+sqp6y18WuuXl6yfZ227trk3xoTyvaLDv2r6ouS3JBd39wPwvbEPO8/l6d5HlVdWeSG5O8bH9KW0tyd4zcXR65O0b2jpG9i9uT3J3nciOsr68nubC7766qX03yD1X1+O7+31UXxpmrqp6X5PIkv7nqWjZFVT0gyVuSvGDFpWyya5K8p7vfXFVPyuzENZd2t09B2E9yl5WQvbsne4ftOneX8Ynlbq63lTrF9bYOsIX7190/7O67k6S7P53Zd6Ufu+cVr695esn25updVf1WklcleVZ3/3CfatsEO/XvIUkuTXJTVd2R5DeSHHESgZ+a5/V3bZL3JUl335zkgUnO35fq1o/cHSN3l0fujpG9Y2Tv4vYkd5cxWLre1piF+1dVD6uqs5Kkqi5KcnGS4zm45ukl29uxd1X1hCTvzCzYvrmCGtfZafvX3Se6+/zuPtzdhzP7PZlndfenVlPu2pnn3+5XkzwjSarqcZkF3Lf2tcr1IXfHyN3lkbtjZO8Y2bu4vcndJZ1Z6Mok/5HZO3evmu57TWZ/eZkKeX+SY0n+LclFWx57R2bX4Lo3s+/3/szZnM70bdH+JXlOkluTHE3ymSS/t+rnsuptjl7+2vQ6+15m797fuuqa12Wbo3f/nOS/p9fb0SRHVl3zOm079e+ktTfFmel21b/Mzkj3yczOXHc0yTNXXfOa90vu7kH/5O5CvZS7Y/2TvQP9O2mt7N1F7xbJ3ZoeCAAAAAtZxldhAQAAOMAMlgAAAAwxWAIAADDEYAkAAMAQgyUAAABDDJawx6rqBVX1iWWvBQB+ltyF1TBYAgAAMMRgCQAAwBCDJSxJVb2yqm6vqu9W1W1V9exTrOuqenlVHa+q/6mqN1bVA05a86aq+nZVfbmqrthy/wur6ovTMY5X1Yv3+nkBwDqSu7BeDJawPLcneUqSQ0n+LMlfV9UjTrH22UkuT3JZkquSvGjLz349yZeSnJ/kz5NcV1U1/eybSX43yUOTvDDJW6vqsiU/DwDY4MmTYAAAAYpJREFUBHIX1ojBEpaku9/f3Xd19/3dfUOS/0zyxFMsf0N339PdX03yF0mu2fKzr3T3u7v7x0muT/KIJA+fjvHB7r69Zz6W5J8yC1UAOFDkLqwXgyUsSVX9flUdrarvVNV3klya2buf2/nalv2vJPnFLbe/8ZOd7v7+tPvg6RhXVNUtVXXPdIwrT3MMADhjyV1YLwZLWIKqelSSdyd5aZLzuvvnk3whSZ3iIRds2b8wyV1zHOPcJH+X5E1JHj4d48bTHAMAzkhyF9aPwRKW40FJOsm3ktkv+2f2zumpvKKqfqGqLkjyR0lumOMY5yQ5dzrGfdPJBZ45VDUAbCa5C2vm7FUXAGeC7r6tqt6c5OYk9yd5b5JPnuYh/5jk05mdcOA9Sa6b4xjfraqXJ3lfZkH3gSRHxioHgM0jd2H9VHevugY4UKqqk1zc3cdWXQsAnOnkLuwPX4UFAABgiMESAACAIb4KCwAAwBCfWAIAADDEYAkAAMAQgyUAAABDDJYAAAAMMVgCAAAw5P8BgrP9gG8LaW8AAAAASUVORK5CYII=\n",
      "text/plain": [
       "<Figure size 1008x576 with 2 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "from sklearn.pipeline import make_pipeline\n",
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "from sklearn.naive_bayes import MultinomialNB\n",
    "\n",
    "precision_scores = []\n",
    "recall_scores = []\n",
    "\n",
    "for max_df in [.01, .05, .1, .2, .4, .8]:\n",
    "\n",
    "    pipeline = make_pipeline(\n",
    "        CountVectorizer(ngram_range=(4,4), min_df=3, max_df=max_df, strip_accents='ascii'),\n",
    "        MultinomialNB(fit_prior=False, alpha=1)\n",
    "    )\n",
    "\n",
    "    pipeline.fit(df_train['text'], y_train)\n",
    "    y_test_pred =  pipeline.predict(df_test['text'])\n",
    "\n",
    "    p, r, f, s = precision_recall_fscore_support(y_test, y_test_pred)\n",
    "    \n",
    "    precision_score = [f'{max_df}']\n",
    "    precision_score.extend(p)\n",
    "    \n",
    "    recall_score = [f'{max_df}']\n",
    "    recall_score.extend(r)\n",
    "    \n",
    "    precision_scores.append(precision_score)\n",
    "    recall_scores.append(recall_score)\n",
    "    \n",
    "fig, axs = plt.subplots(1, 2, figsize=(14, 8))\n",
    "\n",
    "pd.DataFrame(\n",
    "    precision_scores,\n",
    "    columns=['alpha', 'Precision(negative)',  'Precision(positive)']\n",
    ").set_index('alpha').plot(\n",
    "    title='Precision vs max_df',\n",
    "    # color='k',\n",
    "    kind='line',\n",
    "    ylim=(0,1),\n",
    "    ax=axs[0],\n",
    ")\n",
    "\n",
    "pd.DataFrame(\n",
    "    recall_scores,\n",
    "    columns=['alpha', 'Recall(negative)', 'Recall(positive)']\n",
    ").set_index('alpha').plot(\n",
    "    title='Recall vs max_df',\n",
    "    # color='k',\n",
    "    kind='line',\n",
    "    ylim=(0,1),\n",
    "    ax=axs[1],\n",
    ")\n",
    "\n",
    "fig.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# GridSearch + Pipeline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 234,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.pipeline import Pipeline\n",
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "from sklearn.naive_bayes import MultinomialNB\n",
    "\n",
    "pipe = Pipeline(steps=[('CountVectorizer', CountVectorizer()), ('MultinomialNB', MultinomialNB())])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 235,
   "metadata": {},
   "outputs": [],
   "source": [
    "param_grid = {\n",
    "    'CountVectorizer__ngram_range': [(1,1), (1,2), (1,3)],\n",
    "    'MultinomialNB__alpha': [0.1, 1],\n",
    "    'MultinomialNB__fit_prior': [True, False],\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 236,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'CountVectorizer__ngram_range': (1, 3), 'MultinomialNB__alpha': 1, 'MultinomialNB__fit_prior': False}\n"
     ]
    }
   ],
   "source": [
    "from sklearn.model_selection import GridSearchCV\n",
    "\n",
    "search = GridSearchCV(pipe, param_grid, scoring='precision_macro', n_jobs=-1)\n",
    "search.fit(df_train['text'], y_train)\n",
    "print(search.best_params_)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 160,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Macro Precision = 80.47% & Recall=80.48%\n"
     ]
    }
   ],
   "source": [
    "y_test_pred = search.predict(df_test['text'])\n",
    "\n",
    "p, r, f, s = precision_recall_fscore_support(y_test, y_test_pred, average='macro')\n",
    "\n",
    "print(f'Macro Precision = {p:.2%} & Recall={r:.2%}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 237,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'CountVectorizer__ngram_range': (1, 3), 'MultinomialNB__alpha': 1, 'MultinomialNB__fit_prior': False}\n"
     ]
    }
   ],
   "source": [
    "from sklearn.model_selection import GridSearchCV\n",
    "\n",
    "search = GridSearchCV(pipe, param_grid, scoring='recall_macro', n_jobs=-1)\n",
    "search.fit(df_train['text'], y_train)\n",
    "print(search.best_params_)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 238,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Macro Precision = 80.47% & Recall=80.48%\n"
     ]
    }
   ],
   "source": [
    "y_test_pred = search.predict(df_test['text'])\n",
    "\n",
    "p, r, f, s = precision_recall_fscore_support(y_test, y_test_pred, average='macro')\n",
    "\n",
    "print(f'Macro Precision = {p:.2%} & Recall={r:.2%}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 163,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "\n",
    "vec = CountVectorizer(ngram_range=(1,3), min_df=3, max_df=0.2, binary=True, strip_accents='ascii')\n",
    "\n",
    "x_train = vec.fit_transform(df_train['text'])\n",
    "x_test = vec.transform(df_test['text'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 164,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Precision</th>\n",
       "      <th>Recall</th>\n",
       "      <th>F</th>\n",
       "      <th>Support</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.811252</td>\n",
       "      <td>0.791150</td>\n",
       "      <td>0.801075</td>\n",
       "      <td>565</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0.784278</td>\n",
       "      <td>0.804878</td>\n",
       "      <td>0.794444</td>\n",
       "      <td>533</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Precision    Recall         F  Support\n",
       "0   0.811252  0.791150  0.801075      565\n",
       "1   0.784278  0.804878  0.794444      533"
      ]
     },
     "execution_count": 164,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.metrics import precision_recall_fscore_support \n",
    "from sklearn.naive_bayes import BernoulliNB\n",
    "\n",
    "clf = BernoulliNB(fit_prior=False)\n",
    "clf.fit(x_train, y_train)\n",
    "y_test_pred = clf.predict(x_test)\n",
    "\n",
    "p, r, f, s = precision_recall_fscore_support(y_test, y_test_pred)\n",
    "\n",
    "pd.DataFrame(\n",
    "    {\n",
    "        'Precision': p,\n",
    "        'Recall': r,\n",
    "        'F': f,\n",
    "        'Support': s,\n",
    "    },\n",
    "    index=[0,1] \n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 165,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.7978142076502732"
      ]
     },
     "execution_count": 165,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "precision_recall_fscore_support(y_test, y_test_pred, average='micro')[2]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 166,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_vocab = pd.DataFrame(vec.vocabulary_.items(), columns=['token', 'id']).sort_values('id').set_index('id')\n",
    "\n",
    "df_vocab['positive_coef'] = clf.coef_[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 167,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>token</th>\n",
       "      <th>positive_coef</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>id</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1000</th>\n",
       "      <td>of</td>\n",
       "      <td>-1.801171</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1652</th>\n",
       "      <td>was</td>\n",
       "      <td>-1.937747</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>692</th>\n",
       "      <td>in</td>\n",
       "      <td>-2.124958</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>589</th>\n",
       "      <td>great</td>\n",
       "      <td>-2.124958</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>577</th>\n",
       "      <td>good</td>\n",
       "      <td>-2.196054</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1758</th>\n",
       "      <td>with</td>\n",
       "      <td>-2.272594</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>515</th>\n",
       "      <td>for</td>\n",
       "      <td>-2.343212</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1621</th>\n",
       "      <td>very</td>\n",
       "      <td>-2.355482</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1359</th>\n",
       "      <td>that</td>\n",
       "      <td>-2.406126</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>946</th>\n",
       "      <td>my</td>\n",
       "      <td>-2.432443</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>119</th>\n",
       "      <td>are</td>\n",
       "      <td>-2.591048</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>623</th>\n",
       "      <td>have</td>\n",
       "      <td>-2.672394</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1020</th>\n",
       "      <td>on</td>\n",
       "      <td>-2.706880</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1274</th>\n",
       "      <td>so</td>\n",
       "      <td>-2.760947</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1811</th>\n",
       "      <td>you</td>\n",
       "      <td>-2.779639</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>240</th>\n",
       "      <td>but</td>\n",
       "      <td>-2.818106</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>488</th>\n",
       "      <td>film</td>\n",
       "      <td>-2.818106</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1007</th>\n",
       "      <td>of the</td>\n",
       "      <td>-2.965742</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>931</th>\n",
       "      <td>movie</td>\n",
       "      <td>-2.965742</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1084</th>\n",
       "      <td>phone</td>\n",
       "      <td>-2.988731</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       token  positive_coef\n",
       "id                         \n",
       "1000      of      -1.801171\n",
       "1652     was      -1.937747\n",
       "692       in      -2.124958\n",
       "589    great      -2.124958\n",
       "577     good      -2.196054\n",
       "1758    with      -2.272594\n",
       "515      for      -2.343212\n",
       "1621    very      -2.355482\n",
       "1359    that      -2.406126\n",
       "946       my      -2.432443\n",
       "119      are      -2.591048\n",
       "623     have      -2.672394\n",
       "1020      on      -2.706880\n",
       "1274      so      -2.760947\n",
       "1811     you      -2.779639\n",
       "240      but      -2.818106\n",
       "488     film      -2.818106\n",
       "1007  of the      -2.965742\n",
       "931    movie      -2.965742\n",
       "1084   phone      -2.988731"
      ]
     },
     "execution_count": 167,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_vocab.sort_values('positive_coef', ascending=False).head(20)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 168,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Precision</th>\n",
       "      <th>Recall</th>\n",
       "      <th>F</th>\n",
       "      <th>Support</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.785467</td>\n",
       "      <td>0.803540</td>\n",
       "      <td>0.794401</td>\n",
       "      <td>565</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0.786538</td>\n",
       "      <td>0.767355</td>\n",
       "      <td>0.776828</td>\n",
       "      <td>533</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Precision    Recall         F  Support\n",
       "0   0.785467  0.803540  0.794401      565\n",
       "1   0.786538  0.767355  0.776828      533"
      ]
     },
     "execution_count": 168,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.metrics import precision_recall_fscore_support \n",
    "from sklearn.linear_model import LogisticRegression\n",
    "\n",
    "clf = LogisticRegression()\n",
    "clf.fit(x_train, y_train)\n",
    "y_test_pred = clf.predict(x_test)\n",
    "\n",
    "p, r, f, s = precision_recall_fscore_support(y_test, y_test_pred)\n",
    "\n",
    "pd.DataFrame(\n",
    "    {\n",
    "        'Precision': p,\n",
    "        'Recall': r,\n",
    "        'F': f,\n",
    "        'Support': s,\n",
    "    },\n",
    "    index=[0,1] \n",
    ")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 169,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.7859744990892532"
      ]
     },
     "execution_count": 169,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "precision_recall_fscore_support(y_test, y_test_pred, average='micro')[2]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 170,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "\n",
    "vec = TfidfVectorizer(ngram_range=(1,3), sublinear_tf=True)\n",
    "\n",
    "x_train = vec.fit_transform(df_train['text'])\n",
    "x_test = vec.transform(df_test['text'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 171,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Precision</th>\n",
       "      <th>Recall</th>\n",
       "      <th>F</th>\n",
       "      <th>Support</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.780797</td>\n",
       "      <td>0.762832</td>\n",
       "      <td>0.77171</td>\n",
       "      <td>565</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0.754579</td>\n",
       "      <td>0.772983</td>\n",
       "      <td>0.76367</td>\n",
       "      <td>533</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Precision    Recall        F  Support\n",
       "0   0.780797  0.762832  0.77171      565\n",
       "1   0.754579  0.772983  0.76367      533"
      ]
     },
     "execution_count": 171,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.metrics import precision_recall_fscore_support \n",
    "from sklearn.naive_bayes import MultinomialNB\n",
    "\n",
    "clf = MultinomialNB(fit_prior=False, alpha=0.001)\n",
    "clf.fit(x_train, y_train)\n",
    "y_test_pred = clf.predict(x_test)\n",
    "\n",
    "p, r, f, s = precision_recall_fscore_support(y_test, y_test_pred)\n",
    "\n",
    "pd.DataFrame(\n",
    "    {\n",
    "        'Precision': p,\n",
    "        'Recall': r,\n",
    "        'F': f,\n",
    "        'Support': s,\n",
    "    },\n",
    "    index=[0,1] \n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 172,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.76775956284153"
      ]
     },
     "execution_count": 172,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "precision_recall_fscore_support(y_test, y_test_pred, average='micro')[1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 173,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_vocab = pd.DataFrame(vec.vocabulary_.items(), columns=['token', 'id']).sort_values('id').set_index('id')\n",
    "\n",
    "df_vocab['positive_coef'] = clf.coef_[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 174,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>token</th>\n",
       "      <th>positive_coef</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>id</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>24844</th>\n",
       "      <td>the</td>\n",
       "      <td>-5.299478</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1173</th>\n",
       "      <td>and</td>\n",
       "      <td>-5.396813</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10733</th>\n",
       "      <td>great</td>\n",
       "      <td>-5.502234</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13053</th>\n",
       "      <td>is</td>\n",
       "      <td>-5.606420</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13655</th>\n",
       "      <td>it</td>\n",
       "      <td>-5.731052</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>26803</th>\n",
       "      <td>this</td>\n",
       "      <td>-5.763281</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10478</th>\n",
       "      <td>good</td>\n",
       "      <td>-5.813763</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>27429</th>\n",
       "      <td>to</td>\n",
       "      <td>-6.003960</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29459</th>\n",
       "      <td>was</td>\n",
       "      <td>-6.030217</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17906</th>\n",
       "      <td>of</td>\n",
       "      <td>-6.098856</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29047</th>\n",
       "      <td>very</td>\n",
       "      <td>-6.154792</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12491</th>\n",
       "      <td>in</td>\n",
       "      <td>-6.270223</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>30924</th>\n",
       "      <td>with</td>\n",
       "      <td>-6.389329</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9454</th>\n",
       "      <td>for</td>\n",
       "      <td>-6.422416</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16815</th>\n",
       "      <td>my</td>\n",
       "      <td>-6.456033</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19726</th>\n",
       "      <td>phone</td>\n",
       "      <td>-6.469763</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2336</th>\n",
       "      <td>are</td>\n",
       "      <td>-6.479058</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>31313</th>\n",
       "      <td>works</td>\n",
       "      <td>-6.481447</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8349</th>\n",
       "      <td>excellent</td>\n",
       "      <td>-6.510219</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>22445</th>\n",
       "      <td>service</td>\n",
       "      <td>-6.605499</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "           token  positive_coef\n",
       "id                             \n",
       "24844        the      -5.299478\n",
       "1173         and      -5.396813\n",
       "10733      great      -5.502234\n",
       "13053         is      -5.606420\n",
       "13655         it      -5.731052\n",
       "26803       this      -5.763281\n",
       "10478       good      -5.813763\n",
       "27429         to      -6.003960\n",
       "29459        was      -6.030217\n",
       "17906         of      -6.098856\n",
       "29047       very      -6.154792\n",
       "12491         in      -6.270223\n",
       "30924       with      -6.389329\n",
       "9454         for      -6.422416\n",
       "16815         my      -6.456033\n",
       "19726      phone      -6.469763\n",
       "2336         are      -6.479058\n",
       "31313      works      -6.481447\n",
       "8349   excellent      -6.510219\n",
       "22445    service      -6.605499"
      ]
     },
     "execution_count": 174,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_vocab.sort_values('positive_coef', ascending=False).head(20)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# SpaCy Word Embedding"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 175,
   "metadata": {},
   "outputs": [],
   "source": [
    "# pip install spacy\n",
    "# python -m spacy download en_core_web_lg"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 176,
   "metadata": {},
   "outputs": [],
   "source": [
    "import spacy\n",
    "\n",
    "nlp = spacy.load('en_core_web_md')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 177,
   "metadata": {},
   "outputs": [],
   "source": [
    "# pip install tqdm\n",
    "from tqdm import tqdm\n",
    "tqdm.pandas()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 178,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 1647/1647 [00:21<00:00, 76.70it/s] \n",
      "100%|██████████| 1098/1098 [00:10<00:00, 102.28it/s]\n"
     ]
    }
   ],
   "source": [
    "import spacy\n",
    "\n",
    "class WordEmbeddingVectorizer:\n",
    "    \n",
    "    def __init__(self, language_model='en_core_web_md'):\n",
    "        self.nlp = spacy.load(language_model)\n",
    "    \n",
    "    def fit(self):\n",
    "        pass\n",
    "    \n",
    "    def transform(self, x, y=None):\n",
    "        return pd.Series(x).progress_apply(\n",
    "            lambda doc: self.nlp(doc).vector.tolist()\n",
    "        ).values.tolist()\n",
    "    \n",
    "    def fit_transform(self, x, y=None):\n",
    "        return self.transform(x)\n",
    "    \n",
    "\n",
    "vec = WordEmbeddingVectorizer()\n",
    "x_train_w2v = vec.transform(df_train['text'])\n",
    "x_test_w2v = vec.transform(df_test['text'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 179,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Precision</th>\n",
       "      <th>Recall</th>\n",
       "      <th>F</th>\n",
       "      <th>Support</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.844203</td>\n",
       "      <td>0.824779</td>\n",
       "      <td>0.834378</td>\n",
       "      <td>565</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0.818681</td>\n",
       "      <td>0.838649</td>\n",
       "      <td>0.828545</td>\n",
       "      <td>533</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Precision    Recall         F  Support\n",
       "0   0.844203  0.824779  0.834378      565\n",
       "1   0.818681  0.838649  0.828545      533"
      ]
     },
     "execution_count": 179,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.metrics import precision_recall_fscore_support \n",
    "from sklearn.preprocessing import StandardScaler\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.linear_model import SGDClassifier\n",
    "\n",
    "scaler = StandardScaler()\n",
    "# clf = SGDClassifier(loss='log', max_iter=10000, class_weight='balanced')\n",
    "clf = LogisticRegression(max_iter=10000, class_weight=None)\n",
    "\n",
    "x_train_scaled = scaler.fit_transform(x_train_w2v)\n",
    "x_test_scaled = scaler.transform(x_test_w2v)\n",
    "\n",
    "clf.fit(x_train_scaled, y_train)\n",
    "y_test_pred = clf.predict(x_test_scaled)\n",
    "\n",
    "p, r, f, s = precision_recall_fscore_support(y_test, y_test_pred)\n",
    "\n",
    "pd.DataFrame(\n",
    "    {\n",
    "        'Precision': p,\n",
    "        'Recall': r,\n",
    "        'F': f,\n",
    "        'Support': s,\n",
    "    },\n",
    "    index=[0,1] \n",
    ")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 180,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.831511839708561"
      ]
     },
     "execution_count": 180,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "precision_recall_fscore_support(y_test, y_test_pred, average='micro')[1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
